polars_core/frame/mod.rs
1#![allow(unsafe_op_in_unsafe_fn)]
2//! DataFrame module.
3use std::sync::OnceLock;
4use std::{mem, ops};
5
6use arrow::datatypes::ArrowSchemaRef;
7use polars_row::ArrayRef;
8use polars_schema::schema::ensure_matching_schema_names;
9use polars_utils::UnitVec;
10use polars_utils::itertools::Itertools;
11use rayon::prelude::*;
12
13use crate::chunked_array::flags::StatisticsFlags;
14#[cfg(feature = "algorithm_group_by")]
15use crate::chunked_array::ops::unique::is_unique_helper;
16use crate::prelude::*;
17#[cfg(feature = "row_hash")]
18use crate::utils::split_df;
19use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};
20use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};
21
22#[cfg(feature = "dataframe_arithmetic")]
23mod arithmetic;
24pub mod builder;
25mod chunks;
26pub use chunks::chunk_df_for_writing;
27pub mod column;
28pub mod explode;
29mod from;
30#[cfg(feature = "algorithm_group_by")]
31pub mod group_by;
32pub(crate) mod horizontal;
33#[cfg(feature = "proptest")]
34pub mod proptest;
35#[cfg(any(feature = "rows", feature = "object"))]
36pub mod row;
37mod top_k;
38mod upstream_traits;
39mod validation;
40
41use arrow::record_batch::{RecordBatch, RecordBatchT};
42use polars_utils::pl_str::PlSmallStr;
43#[cfg(feature = "serde")]
44use serde::{Deserialize, Serialize};
45use strum_macros::IntoStaticStr;
46
47use crate::POOL;
48#[cfg(feature = "row_hash")]
49use crate::hashing::_df_rows_to_hashes_threaded_vertical;
50use crate::prelude::sort::arg_sort;
51use crate::series::IsSorted;
52
53#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]
54#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
55#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
56#[strum(serialize_all = "snake_case")]
57pub enum UniqueKeepStrategy {
58 /// Keep the first unique row.
59 First,
60 /// Keep the last unique row.
61 Last,
62 /// Keep none of the unique rows.
63 None,
64 /// Keep any of the unique rows.
65 /// This allows more optimizations.
66 #[default]
67 Any,
68}
69
70fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> PolarsResult<()>
71where
72 F: for<'a> FnMut(&'a T) -> &'a str,
73{
74 // Always unique.
75 if items.len() <= 1 {
76 return Ok(());
77 }
78
79 if items.len() <= 4 {
80 // Too small to be worth creating a hashmap for; this is at most 6 comparisons.
81 for i in 0..items.len() - 1 {
82 let name = get_name(&items[i]);
83 for other in items.iter().skip(i + 1) {
84 if name == get_name(other) {
85 polars_bail!(duplicate = name);
86 }
87 }
88 }
89 } else {
90 let mut names = PlHashSet::with_capacity(items.len());
91 for item in items {
92 let name = get_name(item);
93 if !names.insert(name) {
94 polars_bail!(duplicate = name);
95 }
96 }
97 }
98 Ok(())
99}
100
101/// A contiguous growable collection of `Series` that have the same length.
102///
103/// ## Use declarations
104///
105/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
106///
107/// ```rust
108/// use polars_core::prelude::*; // if the crate polars-core is used directly
109/// // use polars::prelude::*; if the crate polars is used
110/// ```
111///
112/// # Initialization
113/// ## Default
114///
115/// A `DataFrame` can be initialized empty:
116///
117/// ```rust
118/// # use polars_core::prelude::*;
119/// let df = DataFrame::default();
120/// assert!(df.is_empty());
121/// ```
122///
123/// ## Wrapping a `Vec<Series>`
124///
125/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.
126///
127/// ```rust
128/// # use polars_core::prelude::*;
129/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
130/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
131///
132/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);
133/// ```
134///
135/// ## Using a macro
136///
137/// The [`df!`] macro is a convenient method:
138///
139/// ```rust
140/// # use polars_core::prelude::*;
141/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
142/// "Color" => ["Red", "Yellow", "Green"]);
143/// ```
144///
145/// ## Using a CSV file
146///
147/// See the `polars_io::csv::CsvReader`.
148///
149/// # Indexing
150/// ## By a number
151///
152/// The `Index<usize>` is implemented for the `DataFrame`.
153///
154/// ```rust
155/// # use polars_core::prelude::*;
156/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
157/// "Color" => ["Red", "Yellow", "Green"])?;
158///
159/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
160/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
161/// # Ok::<(), PolarsError>(())
162/// ```
163///
164/// ## By a `Series` name
165///
166/// ```rust
167/// # use polars_core::prelude::*;
168/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
169/// "Color" => ["Red", "Yellow", "Green"])?;
170///
171/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
172/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
173/// # Ok::<(), PolarsError>(())
174/// ```
175#[derive(Clone)]
176pub struct DataFrame {
177 height: usize,
178 // invariant: columns[i].len() == height for each 0 <= i < columns.len()
179 pub(crate) columns: Vec<Column>,
180
181 /// A cached schema. This might not give correct results if the DataFrame was modified in place
182 /// between caching the schema and reading it.
183 cached_schema: OnceLock<SchemaRef>,
184}
185
186impl DataFrame {
187 pub fn clear_schema(&mut self) {
188 self.cached_schema = OnceLock::new();
189 }
190
191 #[inline]
192 pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {
193 self.columns.iter()
194 }
195
196 #[inline]
197 pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {
198 self.columns.iter().map(Column::as_materialized_series)
199 }
200
201 #[inline]
202 pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {
203 self.columns.par_iter().map(Column::as_materialized_series)
204 }
205
206 /// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.
207 ///
208 /// # Implementation
209 /// This estimation is the sum of the sizes of its buffers and validity bitmaps, including nested arrays.
210 /// Multiple arrays may share buffers and bitmaps, so the combined size of two arrays is not necessarily the
211 /// sum of the sizes computed by this function. In particular, [`StructArray`]'s size is an upper bound.
212 ///
213 /// When an array is sliced, its allocated size remains constant because the buffer is unchanged.
214 /// However, this function will yield a smaller number, because it returns
215 /// the visible size of the buffer, not its total capacity.
216 ///
217 /// FFI buffers are included in this estimation.
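    ///
    /// # Example
    ///
    /// A minimal sketch; the exact figure depends on the dtypes and backing buffers:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df!("x" => [1i32, 2, 3])?;
    /// // Three i32 values occupy at least 12 bytes of buffer space.
    /// assert!(df.estimated_size() >= 12);
    /// # Ok::<(), PolarsError>(())
    /// ```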
218 pub fn estimated_size(&self) -> usize {
219 self.columns.iter().map(Column::estimated_size).sum()
220 }
221
222 // Reduce monomorphization.
223 fn try_apply_columns(
224 &self,
225 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
226 ) -> PolarsResult<Vec<Column>> {
227 self.columns.iter().map(func).collect()
228 }
229 // Reduce monomorphization.
230 pub fn _apply_columns(&self, func: &dyn Fn(&Column) -> Column) -> Vec<Column> {
231 self.columns.iter().map(func).collect()
232 }
233 // Reduce monomorphization.
234 fn try_apply_columns_par(
235 &self,
236 func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),
237 ) -> PolarsResult<Vec<Column>> {
238 POOL.install(|| self.columns.par_iter().map(func).collect())
239 }
240 // Reduce monomorphization.
241 pub fn _apply_columns_par(
242 &self,
243 func: &(dyn Fn(&Column) -> Column + Send + Sync),
244 ) -> Vec<Column> {
245 POOL.install(|| self.columns.par_iter().map(func).collect())
246 }
247
248 /// Get the index of the column.
249 fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {
250 self.get_column_index(name)
251 .ok_or_else(|| polars_err!(col_not_found = name))
252 }
253
254 fn check_already_present(&self, name: &str) -> PolarsResult<()> {
255 polars_ensure!(
256 self.columns.iter().all(|s| s.name().as_str() != name),
257 Duplicate: "column with name {:?} is already present in the DataFrame", name
258 );
259 Ok(())
260 }
261
262 /// Reserve capacity for `additional` extra chunks in the chunk vectors of the series.
263 pub(crate) fn reserve_chunks(&mut self, additional: usize) {
264 for s in &mut self.columns {
265 if let Column::Series(s) = s {
266 // SAFETY:
267 // do not modify the data, simply resize.
268 unsafe { s.chunks_mut().reserve(additional) }
269 }
270 }
271 }
272
273 /// Create a DataFrame from a Vector of Columns.
274 ///
275 /// Errors if the column names are not unique, or if the heights are not all equal.
276 ///
277 /// # Example
278 ///
279 /// ```
280 /// # use polars_core::prelude::*;
281 /// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
282 /// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
283 ///
284 /// let df = DataFrame::new(vec![s0, s1])?;
285 /// # Ok::<(), PolarsError>(())
286 /// ```
287 pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {
288 DataFrame::validate_columns_slice(&columns)
289 .map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
290 Ok(unsafe { Self::new_no_checks_height_from_first(columns) })
291 }
292
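    /// Create a DataFrame from a vector of columns, checking that every column has the given
    /// `height` and that the column names are unique.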
293 pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
294 for col in &columns {
295 polars_ensure!(
296 col.len() == height,
297 ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",
298 columns[0].name(), height, col.name(), col.len()
299 );
300 }
301
302 ensure_names_unique(&columns, |s| s.name().as_str())?;
303
304 Ok(DataFrame {
305 height,
306 columns,
307 cached_schema: OnceLock::new(),
308 })
309 }
310
311 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
312 /// columns to match the other columns.
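    ///
    /// # Example
    ///
    /// A minimal sketch; the unit-length column is repeated to the broadcast length:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let s1 = Column::new("a".into(), [1, 2, 3]);
    /// let s2 = Column::new("b".into(), [10]); // length 1, will be broadcast
    /// let df = DataFrame::new_with_broadcast(vec![s1, s2])?;
    /// assert_eq!(df.shape(), (3, 2));
    /// # Ok::<(), PolarsError>(())
    /// ```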
313 pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
314 // The length of the longest non-unit length column determines the
315 // broadcast length. If all columns are unit-length the broadcast length
316 // is one.
317 let broadcast_len = columns
318 .iter()
319 .map(|s| s.len())
320 .filter(|l| *l != 1)
321 .max()
322 .unwrap_or(1);
323 Self::new_with_broadcast_len(columns, broadcast_len)
324 }
325
326 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
327 /// columns to `broadcast_len`.
328 pub fn new_with_broadcast_len(
329 columns: Vec<Column>,
330 broadcast_len: usize,
331 ) -> PolarsResult<Self> {
332 ensure_names_unique(&columns, |s| s.name().as_str())?;
333 unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }
334 }
335
336 /// Converts a sequence of columns into a DataFrame, broadcasting length-1
337 /// columns to match the other columns.
338 ///
339 /// # Safety
340 /// Does not check that the column names are unique (which they must be).
341 pub unsafe fn new_with_broadcast_no_namecheck(
342 mut columns: Vec<Column>,
343 broadcast_len: usize,
344 ) -> PolarsResult<Self> {
345 for col in &mut columns {
346 // Length not equal to the broadcast len, needs broadcast or is an error.
347 let len = col.len();
348 if len != broadcast_len {
349 if len != 1 {
350 let name = col.name().to_owned();
351 let extra_info =
352 if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {
353 format!(" (matching column '{}')", c.name())
354 } else {
355 String::new()
356 };
357 polars_bail!(
358 ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",
359 );
360 }
361 *col = col.new_from_index(0, broadcast_len);
362 }
363 }
364
365 let length = if columns.is_empty() { 0 } else { broadcast_len };
366
367 Ok(unsafe { DataFrame::new_no_checks(length, columns) })
368 }
369
370 pub fn new_from_index(&self, index: usize, height: usize) -> Self {
371 let cols = self.columns.iter().map(|c| c.new_from_index(index, height));
372 unsafe { Self::new_no_checks(height, cols.collect()) }
373 }
374
375 /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
376 ///
377 /// # Example
378 ///
379 /// ```rust
380 /// use polars_core::prelude::DataFrame;
381 /// static EMPTY: DataFrame = DataFrame::empty();
382 /// ```
383 pub const fn empty() -> Self {
384 Self::empty_with_height(0)
385 }
386
387 /// Creates an empty `DataFrame` with a specific `height`.
388 pub const fn empty_with_height(height: usize) -> Self {
389 DataFrame {
390 height,
391 columns: vec![],
392 cached_schema: OnceLock::new(),
393 }
394 }
395
396 /// Create an empty `DataFrame` with empty columns as per the `schema`.
397 pub fn empty_with_arc_schema(schema: Arc<Schema>) -> Self {
398 let mut df = Self::empty_with_schema(&schema);
399 df.cached_schema = OnceLock::from(schema);
400 df
401 }
402
403 /// Create an empty `DataFrame` with empty columns as per the `schema`.
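    ///
    /// # Example
    ///
    /// A small illustration with a hand-built schema:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let schema = Schema::from_iter(vec![
    ///     Field::new("x".into(), DataType::Int64),
    ///     Field::new("y".into(), DataType::String),
    /// ]);
    /// let df = DataFrame::empty_with_schema(&schema);
    /// assert_eq!(df.shape(), (0, 2));
    /// ```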
404 pub fn empty_with_schema(schema: &Schema) -> Self {
405 let cols = schema
406 .iter()
407 .map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))
408 .collect();
409 unsafe { DataFrame::new_no_checks(0, cols) }
410 }
411
412 /// Create an empty `DataFrame` with empty columns as per the `schema`.
413 pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {
414 let cols = schema
415 .iter_values()
416 .map(|fld| {
417 Column::from(Series::new_empty(
418 fld.name.clone(),
419 &(DataType::from_arrow_field(fld)),
420 ))
421 })
422 .collect();
423 unsafe { DataFrame::new_no_checks(0, cols) }
424 }
425
426 /// Create a new `DataFrame` with the given schema, only containing nulls.
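    ///
    /// # Example
    ///
    /// A minimal sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let schema = Schema::from_iter(vec![Field::new("x".into(), DataType::Int32)]);
    /// let df = DataFrame::full_null(&schema, 3);
    /// assert_eq!(df.shape(), (3, 1));
    /// assert!(df.column("x")?.has_nulls());
    /// # Ok::<(), PolarsError>(())
    /// ```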
427 pub fn full_null(schema: &Schema, height: usize) -> Self {
428 let columns = schema
429 .iter_fields()
430 .map(|f| Column::full_null(f.name.clone(), height, f.dtype()))
431 .collect();
432 unsafe { DataFrame::new_no_checks(height, columns) }
433 }
434
435 /// Ensure this DataFrame matches the given schema. Casts null columns to
436 /// the expected schema if necessary (but nothing else).
437 pub fn ensure_matches_schema(&mut self, schema: &Schema) -> PolarsResult<()> {
438 let mut any_needed_cast = false;
439 for (col, (name, dt)) in self.columns.iter_mut().zip(schema.iter()) {
440 polars_ensure!(
441 col.name() == name,
442 SchemaMismatch: "column name mismatch: expected {:?}, found {:?}",
443 name,
444 col.name()
445 );
446
447 let needs_cast = !col.dtype().matches_schema_type(dt)?;
448 any_needed_cast |= needs_cast;
449 if needs_cast {
450 *col = col.cast(dt)?;
451 }
452 }
453 if any_needed_cast {
454 self.clear_schema();
455 }
456 Ok(())
457 }
458
459 /// Removes the last column from the `DataFrame` and returns it, or [`None`] if it is empty.
460 ///
461 /// # Example
462 ///
463 /// ```rust
464 /// # use polars_core::prelude::*;
465 /// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);
466 /// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);
467 /// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;
468 ///
469 /// assert_eq!(df.pop(), Some(s2));
470 /// assert_eq!(df.pop(), Some(s1));
471 /// assert_eq!(df.pop(), None);
472 /// assert!(df.is_empty());
473 /// # Ok::<(), PolarsError>(())
474 /// ```
475 pub fn pop(&mut self) -> Option<Column> {
476 self.clear_schema();
477
478 self.columns.pop()
479 }
480
481 /// Add a new column at index 0 that counts the rows.
482 ///
483 /// # Example
484 ///
485 /// ```
486 /// # use polars_core::prelude::*;
487 /// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;
488 /// assert_eq!(df1.shape(), (4, 1));
489 ///
490 /// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;
491 /// assert_eq!(df2.shape(), (4, 2));
492 /// println!("{}", df2);
493 ///
494 /// # Ok::<(), PolarsError>(())
495 /// ```
496 ///
497 /// Output:
498 ///
499 /// ```text
500 /// shape: (4, 2)
501 /// +-----+----------+
502 /// | Id | Name |
503 /// | --- | --- |
504 /// | u32 | str |
505 /// +=====+==========+
506 /// | 0 | James |
507 /// +-----+----------+
508 /// | 1 | Mary |
509 /// +-----+----------+
510 /// | 2 | John |
511 /// +-----+----------+
512 /// | 3 | Patricia |
513 /// +-----+----------+
514 /// ```
515 pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {
516 let mut columns = Vec::with_capacity(self.columns.len() + 1);
517 let offset = offset.unwrap_or(0);
518
519 let col = Column::new_row_index(name, offset, self.height())?;
520 columns.push(col);
521 columns.extend_from_slice(&self.columns);
522 DataFrame::new(columns)
523 }
524
525 /// Add a row index column in place.
526 ///
527 /// # Safety
528 /// The caller should ensure the DataFrame does not already contain a column with the given name.
529 ///
530 /// # Panics
531 /// Panics if the resulting column would reach or overflow IdxSize::MAX.
532 pub unsafe fn with_row_index_mut(
533 &mut self,
534 name: PlSmallStr,
535 offset: Option<IdxSize>,
536 ) -> &mut Self {
538 debug_assert!(
539 self.columns.iter().all(|c| c.name() != &name),
540 "with_row_index_mut(): column with name {} already exists",
541 &name
542 );
543
544 let offset = offset.unwrap_or(0);
545 let col = Column::new_row_index(name, offset, self.height()).unwrap();
546
547 self.clear_schema();
548 self.columns.insert(0, col);
549 self
550 }
551
552 /// Create a new `DataFrame` without checking the lengths of the `Series` or for duplicate
553 /// names.
554 ///
555 /// Calculates the height from the first column or `0` if no columns are given.
556 ///
557 /// # Safety
558 ///
559 /// It is the caller's responsibility to uphold the contract that all `Series`
560 /// have an equal length and a unique name; if not, this may panic down the line.
561 pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {
562 let height = columns.first().map_or(0, Column::len);
563 unsafe { Self::new_no_checks(height, columns) }
564 }
565
566 /// Create a new `DataFrame` without checking the lengths of the `Series` or for duplicate
567 /// names.
568 ///
569 /// It is advised to use [DataFrame::new] in favor of this method.
570 ///
571 /// # Safety
572 ///
573 /// It is the caller's responsibility to uphold the contract that all `Series`
574 /// have an equal length and a unique name; if not, this may panic down the line.
575 pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {
576 if cfg!(debug_assertions) {
577 DataFrame::validate_columns_slice(&columns).unwrap();
578 }
579
580 unsafe { Self::_new_no_checks_impl(height, columns) }
581 }
582
583 /// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
584 /// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
585 /// constructed with this method is generally highly unsafe and should not be long-lived.
586 #[allow(clippy::missing_safety_doc)]
587 pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {
588 DataFrame {
589 height,
590 columns,
591 cached_schema: OnceLock::new(),
592 }
593 }
594
595 /// Shrink the capacity of this DataFrame to fit its length.
596 pub fn shrink_to_fit(&mut self) {
597 // Don't parallelize this. Memory overhead
598 for s in &mut self.columns {
599 s.shrink_to_fit();
600 }
601 }
602
603 /// Aggregate all the chunks in the DataFrame to a single chunk.
604 pub fn as_single_chunk(&mut self) -> &mut Self {
605 // Don't parallelize this. Memory overhead
606 for s in &mut self.columns {
607 *s = s.rechunk();
608 }
609 self
610 }
611
612 /// Aggregate all the chunks in the DataFrame to a single chunk in parallel.
613 /// This may lead to more peak memory consumption.
614 pub fn as_single_chunk_par(&mut self) -> &mut Self {
615 if self.columns.iter().any(|c| c.n_chunks() > 1) {
616 self.columns = self._apply_columns_par(&|s| s.rechunk());
617 }
618 self
619 }
620
621 /// Rechunks all columns to only have a single chunk.
622 pub fn rechunk_mut(&mut self) {
623 // SAFETY: We never adjust the length or names of the columns.
624 let columns = unsafe { self.get_columns_mut() };
625
626 for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {
627 *col = col.rechunk();
628 }
629 }
630
631 pub fn _deshare_views_mut(&mut self) {
632 // SAFETY: We never adjust the length or names of the columns.
633 unsafe {
634 let columns = self.get_columns_mut();
635 for col in columns {
636 let Column::Series(s) = col else { continue };
637
638 if let Ok(ca) = s.binary() {
639 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
640 *col = Column::from(gc_ca.into_series());
641 } else if let Ok(ca) = s.str() {
642 let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());
643 *col = Column::from(gc_ca.into_series());
644 }
645 }
646 }
647 }
648
649 /// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].
650 pub fn rechunk_to_record_batch(
651 self,
652 compat_level: CompatLevel,
653 ) -> RecordBatchT<Box<dyn Array>> {
654 let height = self.height();
655
656 let (schema, arrays) = self
657 .columns
658 .into_iter()
659 .map(|col| {
660 let mut series = col.take_materialized_series();
661 // Rechunk to one chunk if necessary
662 if series.n_chunks() > 1 {
663 series = series.rechunk();
664 }
665 (
666 series.field().to_arrow(compat_level),
667 series.to_arrow(0, compat_level),
668 )
669 })
670 .collect();
671
672 RecordBatchT::new(height, Arc::new(schema), arrays)
673 }
674
675 /// Returns true if the chunks of the columns do not align and re-chunking should be done.
676 pub fn should_rechunk(&self) -> bool {
677 // Fast check. It is also needed for correctness, as code below doesn't check if the number
678 // of chunks is equal.
679 if !self
680 .get_columns()
681 .iter()
682 .filter_map(|c| c.as_series().map(|s| s.n_chunks()))
683 .all_equal()
684 {
685 return true;
686 }
687
688 // From here we check chunk lengths.
689 let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());
690 match chunk_lengths.next() {
691 None => false,
692 Some(first_column_chunk_lengths) => {
693 // Fast Path for single Chunk Series
694 if first_column_chunk_lengths.size_hint().0 == 1 {
695 return chunk_lengths.any(|cl| cl.size_hint().0 != 1);
696 }
697 // Always rechunk if we have more chunks than rows.
698 // except when we have an empty df containing a single chunk
699 let height = self.height();
700 let n_chunks = first_column_chunk_lengths.size_hint().0;
701 if n_chunks > height && !(height == 0 && n_chunks == 1) {
702 return true;
703 }
704 // Slow Path for multi Chunk series
705 let v: Vec<_> = first_column_chunk_lengths.collect();
706 for cl in chunk_lengths {
707 if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {
708 return true;
709 }
710 }
711 false
712 },
713 }
714 }
715
716 /// Ensure all the chunks in the [`DataFrame`] are aligned.
717 pub fn align_chunks_par(&mut self) -> &mut Self {
718 if self.should_rechunk() {
719 self.as_single_chunk_par()
720 } else {
721 self
722 }
723 }
724
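    /// Ensure all the chunks in the [`DataFrame`] are aligned
    /// (non-parallel counterpart of [`DataFrame::align_chunks_par`]).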
725 pub fn align_chunks(&mut self) -> &mut Self {
726 if self.should_rechunk() {
727 self.as_single_chunk()
728 } else {
729 self
730 }
731 }
732
733 /// Get the [`DataFrame`] schema.
734 ///
735 /// # Example
736 ///
737 /// ```rust
738 /// # use polars_core::prelude::*;
739 /// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],
740 /// "Diameter (m)" => [8.8e26, f64::INFINITY])?;
741 ///
742 /// let f1: Field = Field::new("Thing".into(), DataType::String);
743 /// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);
744 /// let sc: Schema = Schema::from_iter(vec![f1, f2]);
745 ///
746 /// assert_eq!(&**df.schema(), &sc);
747 /// # Ok::<(), PolarsError>(())
748 /// ```
749 pub fn schema(&self) -> &SchemaRef {
750 let out = self.cached_schema.get_or_init(|| {
751 Arc::new(
752 self.columns
753 .iter()
754 .map(|x| (x.name().clone(), x.dtype().clone()))
755 .collect(),
756 )
757 });
758
759 debug_assert_eq!(out.len(), self.width());
760
761 out
762 }
763
764 /// Get a reference to the [`DataFrame`] columns.
765 ///
766 /// # Example
767 ///
768 /// ```rust
769 /// # use polars_core::prelude::*;
770 /// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],
771 /// "Symbol" => ["A", "C", "G", "T"])?;
772 /// let columns: &[Column] = df.get_columns();
773 ///
774 /// assert_eq!(columns[0].name(), "Name");
775 /// assert_eq!(columns[1].name(), "Symbol");
776 /// # Ok::<(), PolarsError>(())
777 /// ```
778 #[inline]
779 pub fn get_columns(&self) -> &[Column] {
780 &self.columns
781 }
782
783 #[inline]
784 /// Get mutable access to the underlying columns.
785 ///
786 /// # Safety
787 ///
788 /// The caller must ensure the length of all [`Series`] remains equal to `height` or
789 /// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.
790 /// The caller must ensure that the cached schema is cleared if it modifies the schema by
791 /// calling [`DataFrame::clear_schema`].
792 pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {
793 &mut self.columns
794 }
795
796 #[inline]
797 /// Remove all the columns in the [`DataFrame`] but keep the `height`.
798 pub fn clear_columns(&mut self) {
799 unsafe { self.get_columns_mut() }.clear();
800 self.clear_schema();
801 }
802
803 #[inline]
804 /// Extend the columns without checking for name collisions or height.
805 ///
806 /// # Safety
807 ///
808 /// The caller needs to ensure that:
809 /// - Column names are unique within the resulting [`DataFrame`].
810 /// - The length of each appended column matches the height of the [`DataFrame`]. For
811 /// [`DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards
812 /// with [`DataFrame::set_height`].
813 pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {
814 unsafe { self.get_columns_mut() }.extend(iter);
815 self.clear_schema();
816 }
817
818 /// Take ownership of the underlying columns vec.
819 pub fn take_columns(self) -> Vec<Column> {
820 self.columns
821 }
822
823 /// Iterator over the columns as [`Series`].
824 ///
825 /// # Example
826 ///
827 /// ```rust
828 /// # use polars_core::prelude::*;
829 /// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);
830 /// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);
831 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;
832 ///
833 /// let mut iterator = df.iter();
834 ///
835 /// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));
836 /// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));
837 /// assert_eq!(iterator.next(), None);
838 /// # Ok::<(), PolarsError>(())
839 /// ```
840 pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {
841 self.materialized_column_iter()
842 }
843
844 /// # Example
845 ///
846 /// ```rust
847 /// # use polars_core::prelude::*;
848 /// let df: DataFrame = df!("Language" => ["Rust", "Python"],
849 /// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;
850 ///
851 /// assert_eq!(df.get_column_names(), &["Language", "Designer"]);
852 /// # Ok::<(), PolarsError>(())
853 /// ```
854 pub fn get_column_names(&self) -> Vec<&PlSmallStr> {
855 self.columns.iter().map(|s| s.name()).collect()
856 }
857
858 /// Get the [`Vec<PlSmallStr>`] representing the column names.
859 pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {
860 self.columns.iter().map(|s| s.name().clone()).collect()
861 }
862
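    /// Get the column names as `&str`.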
863 pub fn get_column_names_str(&self) -> Vec<&str> {
864 self.columns.iter().map(|s| s.name().as_str()).collect()
865 }
866
867 /// Set the column names.
868 /// # Example
869 ///
870 /// ```rust
871 /// # use polars_core::prelude::*;
872 /// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;
873 /// df.set_column_names(["Set"])?;
874 ///
875 /// assert_eq!(df.get_column_names(), &["Set"]);
876 /// # Ok::<(), PolarsError>(())
877 /// ```
878 pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>
879 where
880 I: IntoIterator<Item = S>,
881 S: Into<PlSmallStr>,
882 {
883 let names = names.into_iter().map(Into::into).collect::<Vec<_>>();
884 self._set_column_names_impl(names.as_slice())
885 }
886
887 fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {
888 polars_ensure!(
889 names.len() == self.width(),
890 ShapeMismatch: "{} column names provided for a DataFrame of width {}",
891 names.len(), self.width()
892 );
893 ensure_names_unique(names, |s| s.as_str())?;
894
895 let columns = mem::take(&mut self.columns);
896 self.columns = columns
897 .into_iter()
898 .zip(names)
899 .map(|(s, name)| {
900 let mut s = s;
901 s.rename(name.clone());
902 s
903 })
904 .collect();
905 self.clear_schema();
906 Ok(())
907 }
908
909 /// Get the data types of the columns in the [`DataFrame`].
910 ///
911 /// # Example
912 ///
913 /// ```rust
914 /// # use polars_core::prelude::*;
915 /// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],
916 /// "Fraction" => [0.965, 0.035])?;
917 ///
918 /// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);
919 /// # Ok::<(), PolarsError>(())
920 /// ```
921 pub fn dtypes(&self) -> Vec<DataType> {
922 self.columns.iter().map(|s| s.dtype().clone()).collect()
923 }
924
925 pub(crate) fn first_series_column(&self) -> Option<&Series> {
926 self.columns.iter().find_map(|col| col.as_series())
927 }
928
929 /// The number of chunks for the first column.
930 pub fn first_col_n_chunks(&self) -> usize {
931 match self.first_series_column() {
932 None if self.columns.is_empty() => 0,
933 None => 1,
934 Some(s) => s.n_chunks(),
935 }
936 }
937
938 /// The highest number of chunks for any column.
939 pub fn max_n_chunks(&self) -> usize {
940 self.columns
941 .iter()
942 .map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))
943 .max()
944 .unwrap_or(0)
945 }
946
947 /// Get a reference to the schema fields of the [`DataFrame`].
948 ///
949 /// # Example
950 ///
951 /// ```rust
952 /// # use polars_core::prelude::*;
953 /// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],
954 /// "Fraction" => [0.708, 0.292])?;
955 ///
956 /// let f1: Field = Field::new("Surface type".into(), DataType::String);
957 /// let f2: Field = Field::new("Fraction".into(), DataType::Float64);
958 ///
959 /// assert_eq!(earth.fields(), &[f1, f2]);
960 /// # Ok::<(), PolarsError>(())
961 /// ```
962 pub fn fields(&self) -> Vec<Field> {
963 self.columns
964 .iter()
965 .map(|s| s.field().into_owned())
966 .collect()
967 }
968
969 /// Get (height, width) of the [`DataFrame`].
970 ///
971 /// # Example
972 ///
973 /// ```rust
974 /// # use polars_core::prelude::*;
975 /// let df0: DataFrame = DataFrame::default();
976 /// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
977 /// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
978 /// "2" => [1, 2, 3, 4, 5])?;
979 ///
980 /// assert_eq!(df0.shape(), (0 ,0));
981 /// assert_eq!(df1.shape(), (5, 1));
982 /// assert_eq!(df2.shape(), (5, 2));
983 /// # Ok::<(), PolarsError>(())
984 /// ```
985 pub fn shape(&self) -> (usize, usize) {
986 (self.height, self.columns.len())
987 }
988
989 /// Get the width of the [`DataFrame`] which is the number of columns.
990 ///
991 /// # Example
992 ///
993 /// ```rust
994 /// # use polars_core::prelude::*;
995 /// let df0: DataFrame = DataFrame::default();
996 /// let df1: DataFrame = df!("Series 1" => [0; 0])?;
997 /// let df2: DataFrame = df!("Series 1" => [0; 0],
998 /// "Series 2" => [0; 0])?;
999 ///
1000 /// assert_eq!(df0.width(), 0);
1001 /// assert_eq!(df1.width(), 1);
1002 /// assert_eq!(df2.width(), 2);
1003 /// # Ok::<(), PolarsError>(())
1004 /// ```
1005 pub fn width(&self) -> usize {
1006 self.columns.len()
1007 }
1008
1009 /// Get the height of the [`DataFrame`] which is the number of rows.
1010 ///
1011 /// # Example
1012 ///
1013 /// ```rust
1014 /// # use polars_core::prelude::*;
1015 /// let df0: DataFrame = DataFrame::default();
1016 /// let df1: DataFrame = df!("Currency" => ["€", "$"])?;
1017 /// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;
1018 ///
1019 /// assert_eq!(df0.height(), 0);
1020 /// assert_eq!(df1.height(), 2);
1021 /// assert_eq!(df2.height(), 5);
1022 /// # Ok::<(), PolarsError>(())
1023 /// ```
1024 pub fn height(&self) -> usize {
1025 self.height
1026 }
1027
1028 /// Returns the size as the number of rows times the number of columns.
1029 pub fn size(&self) -> usize {
1030 let s = self.shape();
1031 s.0 * s.1
1032 }
1033
1034 /// Returns `true` if the [`DataFrame`] contains no rows.
1035 ///
1036 /// # Example
1037 ///
1038 /// ```rust
1039 /// # use polars_core::prelude::*;
1040 /// let df1: DataFrame = DataFrame::default();
1041 /// assert!(df1.is_empty());
1042 ///
1043 /// let df2: DataFrame = df!("First name" => ["Forever"],
1044 /// "Last name" => ["Alone"])?;
1045 /// assert!(!df2.is_empty());
1046 /// # Ok::<(), PolarsError>(())
1047 /// ```
1048 pub fn is_empty(&self) -> bool {
1049 matches!(self.shape(), (0, _) | (_, 0))
1050 }
1051
1052 /// Set the height (i.e. number of rows) of this [`DataFrame`].
1053 ///
1054 /// # Safety
1055 ///
1056 /// This needs to be equal to the length of all the columns.
1057 pub unsafe fn set_height(&mut self, height: usize) {
1058 self.height = height;
1059 }
1060
1061 /// Add multiple columns to a [`DataFrame`].
1062 /// The added columns are required to have the same length as the existing columns.
1063 ///
1064 /// # Example
1065 ///
1066 /// ```rust
1067 /// # use polars_core::prelude::*;
1068 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;
1069 /// let s1 = Column::new("Proton".into(), [29, 47, 79]);
1070 /// let s2 = Column::new("Electron".into(), [29, 47, 79]);
1071 ///
1072 /// let df2: DataFrame = df1.hstack(&[s1, s2])?;
1073 /// assert_eq!(df2.shape(), (3, 3));
1074 /// println!("{}", df2);
1075 /// # Ok::<(), PolarsError>(())
1076 /// ```
1077 ///
1078 /// Output:
1079 ///
1080 /// ```text
1081 /// shape: (3, 3)
1082 /// +---------+--------+----------+
1083 /// | Element | Proton | Electron |
1084 /// | --- | --- | --- |
1085 /// | str | i32 | i32 |
1086 /// +=========+========+==========+
1087 /// | Copper | 29 | 29 |
1088 /// +---------+--------+----------+
1089 /// | Silver | 47 | 47 |
1090 /// +---------+--------+----------+
1091 /// | Gold | 79 | 79 |
1092 /// +---------+--------+----------+
1093 /// ```
1094 pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {
1095 let mut new_cols = self.columns.clone();
1096 new_cols.extend_from_slice(columns);
1097 DataFrame::new(new_cols)
1098 }
1099
1100 /// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].
1101 ///
1102 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1103 ///
1104 /// # Example
1105 ///
1106 /// ```rust
1107 /// # use polars_core::prelude::*;
1108 /// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1109 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1110 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1111 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1112 ///
1113 /// let df3: DataFrame = df1.vstack(&df2)?;
1114 ///
1115 /// assert_eq!(df3.shape(), (5, 2));
1116 /// println!("{}", df3);
1117 /// # Ok::<(), PolarsError>(())
1118 /// ```
1119 ///
1120 /// Output:
1121 ///
1122 /// ```text
1123 /// shape: (5, 2)
1124 /// +-----------+-------------------+
1125 /// | Element | Melting Point (K) |
1126 /// | --- | --- |
1127 /// | str | f64 |
1128 /// +===========+===================+
1129 /// | Copper | 1357.77 |
1130 /// +-----------+-------------------+
1131 /// | Silver | 1234.93 |
1132 /// +-----------+-------------------+
1133 /// | Gold | 1337.33 |
1134 /// +-----------+-------------------+
1135 /// | Platinum | 2041.4 |
1136 /// +-----------+-------------------+
1137 /// | Palladium | 1828.05 |
1138 /// +-----------+-------------------+
1139 /// ```
1140 pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {
1141 let mut df = self.clone();
1142 df.vstack_mut(other)?;
1143 Ok(df)
1144 }
1145
1146 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1147 ///
1148 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1149 ///
1150 /// # Example
1151 ///
1152 /// ```rust
1153 /// # use polars_core::prelude::*;
1154 /// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],
1155 /// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;
1156 /// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],
1157 /// "Melting Point (K)" => [2041.4, 1828.05])?;
1158 ///
1159 /// df1.vstack_mut(&df2)?;
1160 ///
1161 /// assert_eq!(df1.shape(), (5, 2));
1162 /// println!("{}", df1);
1163 /// # Ok::<(), PolarsError>(())
1164 /// ```
1165 ///
1166 /// Output:
1167 ///
1168 /// ```text
1169 /// shape: (5, 2)
1170 /// +-----------+-------------------+
1171 /// | Element | Melting Point (K) |
1172 /// | --- | --- |
1173 /// | str | f64 |
1174 /// +===========+===================+
1175 /// | Copper | 1357.77 |
1176 /// +-----------+-------------------+
1177 /// | Silver | 1234.93 |
1178 /// +-----------+-------------------+
1179 /// | Gold | 1337.33 |
1180 /// +-----------+-------------------+
1181 /// | Platinum | 2041.4 |
1182 /// +-----------+-------------------+
1183 /// | Palladium | 1828.05 |
1184 /// +-----------+-------------------+
1185 /// ```
1186 pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {
1187 if self.width() != other.width() {
1188 polars_ensure!(
1189 self.width() == 0,
1190 ShapeMismatch:
1191 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1192 self.width(), other.width(),
1193 );
1194 self.columns.clone_from(&other.columns);
1195 self.height = other.height;
1196 return Ok(self);
1197 }
1198
1199 self.columns
1200 .iter_mut()
1201 .zip(other.columns.iter())
1202 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1203 ensure_can_extend(&*left, right)?;
1204 left.append(right).map_err(|e| {
1205 e.context(format!("failed to vstack column '{}'", right.name()).into())
1206 })?;
1207 Ok(())
1208 })?;
1209 self.height += other.height;
1210 Ok(self)
1211 }
1212
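    /// Owned variant of [`DataFrame::vstack_mut`]: consumes `other` and appends its columns by value.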
1213 pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {
1214 if self.width() != other.width() {
1215 polars_ensure!(
1216 self.width() == 0,
1217 ShapeMismatch:
1218 "unable to append to a DataFrame of width {} with a DataFrame of width {}",
1219 self.width(), other.width(),
1220 );
1221 self.columns = other.columns;
1222 self.height = other.height;
1223 return Ok(self);
1224 }
1225
1226 self.columns
1227 .iter_mut()
1228 .zip(other.columns.into_iter())
1229 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1230 ensure_can_extend(&*left, &right)?;
1231 let right_name = right.name().clone();
1232 left.append_owned(right).map_err(|e| {
1233 e.context(format!("failed to vstack column '{right_name}'").into())
1234 })?;
1235 Ok(())
1236 })?;
1237 self.height += other.height;
1238 Ok(self)
1239 }
1240
1241 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1242 ///
1243 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1244 ///
1245 /// # Panics
1246 /// Panics if the schemas don't match.
1247 pub fn vstack_mut_unchecked(&mut self, other: &DataFrame) {
1248 self.columns
1249 .iter_mut()
1250 .zip(other.columns.iter())
1251 .for_each(|(left, right)| {
1252 left.append(right)
1253 .map_err(|e| {
1254 e.context(format!("failed to vstack column '{}'", right.name()).into())
1255 })
1256 .expect("should not fail");
1257 });
1258 self.height += other.height;
1259 }
1260
1261 /// Concatenate a [`DataFrame`] to this [`DataFrame`]
1262 ///
1263 /// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].
1264 ///
1265 /// # Panics
1266 /// Panics if the schemas don't match.
1267 pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {
1268 self.columns
1269 .iter_mut()
1270 .zip(other.columns)
1271 .for_each(|(left, right)| {
1272 left.append_owned(right).expect("should not fail");
1273 });
1274 self.height += other.height;
1275 }
1276
1277 /// Extend the memory backed by this [`DataFrame`] with the values from `other`.
1278 ///
1279 /// Different from [`vstack`](Self::vstack), which adds the chunks from `other` to the chunks of this [`DataFrame`],
1280 /// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.
1281 ///
1282 /// If this does not cause a reallocation, the resulting data structure will not have any extra chunks
1283 /// and thus will yield faster queries.
1284 ///
1285 /// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during
1286 /// online operations where you add `n` rows and rerun a query.
1287 ///
1288 /// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance
1289 /// when you read in multiple files and want to store them in a single `DataFrame`. In the latter case, finish the sequence
1290 /// of `append` operations with a [`rechunk`](Self::align_chunks_par).
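    ///
    /// # Example
    ///
    /// A minimal sketch of extending one frame with another of the same schema:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df1 = df!("x" => [1, 2])?;
    /// let df2 = df!("x" => [3, 4])?;
    /// df1.extend(&df2)?;
    /// assert_eq!(df1.shape(), (4, 1));
    /// # Ok::<(), PolarsError>(())
    /// ```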
1291 pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {
1292 polars_ensure!(
1293 self.width() == other.width(),
1294 ShapeMismatch:
1295 "unable to extend a DataFrame of width {} with a DataFrame of width {}",
1296 self.width(), other.width(),
1297 );
1298
1299 self.columns
1300 .iter_mut()
1301 .zip(other.columns.iter())
1302 .try_for_each::<_, PolarsResult<_>>(|(left, right)| {
1303 ensure_can_extend(&*left, right)?;
1304 left.extend(right).map_err(|e| {
1305 e.context(format!("failed to extend column '{}'", right.name()).into())
1306 })?;
1307 Ok(())
1308 })?;
1309 self.height += other.height;
1310 self.clear_schema();
1311 Ok(())
1312 }
1313
1314 /// Remove a column by name and return the column removed.
1315 ///
1316 /// # Example
1317 ///
1318 /// ```rust
1319 /// # use polars_core::prelude::*;
1320 /// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],
1321 /// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;
1322 ///
1323 /// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");
1324 /// assert!(s1.is_err());
1325 ///
1326 /// let s2: Column = df.drop_in_place("Animal")?;
1327 /// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));
1328 /// # Ok::<(), PolarsError>(())
1329 /// ```
1330 pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {
1331 let idx = self.check_name_to_idx(name)?;
1332 self.clear_schema();
1333 Ok(self.columns.remove(idx))
1334 }
1335
1336 /// Return a new [`DataFrame`] with rows containing null values dropped (optionally restricted to a subset of columns).
1337 ///
1338 /// # Example
1339 ///
1340 /// ```no_run
1341 /// # use polars_core::prelude::*;
1342 /// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],
1343 /// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;
1344 /// assert_eq!(df1.shape(), (3, 2));
1345 ///
1346 /// let df2: DataFrame = df1.drop_nulls::<String>(None)?;
1347 /// assert_eq!(df2.shape(), (1, 2));
1348 /// println!("{}", df2);
1349 /// # Ok::<(), PolarsError>(())
1350 /// ```
1351 ///
1352 /// Output:
1353 ///
1354 /// ```text
1355 /// shape: (1, 2)
1356 /// +---------+---------------------+
1357 /// | Country | Tax revenue (% GDP) |
1358 /// | --- | --- |
1359 /// | str | f64 |
1360 /// +=========+=====================+
1361 /// | Malta | 32.7 |
1362 /// +---------+---------------------+
1363 /// ```
1364 pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>
1365 where
1366 for<'a> &'a S: Into<PlSmallStr>,
1367 {
1368 if let Some(v) = subset {
1369 let v = self.select_columns(v)?;
1370 self._drop_nulls_impl(v.as_slice())
1371 } else {
1372 self._drop_nulls_impl(self.columns.as_slice())
1373 }
1374 }
1375
1376 fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {
1377 // fast path for no nulls in df
1378 if subset.iter().all(|s| !s.has_nulls()) {
1379 return Ok(self.clone());
1380 }
1381
1382 let mut iter = subset.iter();
1383
1384 let mask = iter
1385 .next()
1386 .ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;
1387 let mut mask = mask.is_not_null();
1388
1389 for c in iter {
1390 mask = mask & c.is_not_null();
1391 }
1392 self.filter(&mask)
1393 }
1394
1395 /// Drop a column by name.
1396 /// This is a pure method and will return a new [`DataFrame`] instead of modifying
1397 /// the current one in place.
1398 ///
1399 /// # Example
1400 ///
1401 /// ```rust
1402 /// # use polars_core::prelude::*;
1403 /// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;
1404 /// let df2: DataFrame = df1.drop("Ray type")?;
1405 ///
1406 /// assert!(df2.is_empty());
1407 /// # Ok::<(), PolarsError>(())
1408 /// ```
1409 pub fn drop(&self, name: &str) -> PolarsResult<Self> {
1410 let idx = self.check_name_to_idx(name)?;
1411 let mut new_cols = Vec::with_capacity(self.columns.len() - 1);
1412
1413 self.columns.iter().enumerate().for_each(|(i, s)| {
1414 if i != idx {
1415 new_cols.push(s.clone())
1416 }
1417 });
1418
1419 Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })
1420 }
1421
1422 /// Drop columns that are in `names`.
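    ///
    /// # Example
    ///
    /// A small illustration:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1], "b" => [2], "c" => [3])?;
    /// let dropped = df.drop_many(["a", "c"]);
    /// assert_eq!(dropped.get_column_names(), &["b"]);
    /// # Ok::<(), PolarsError>(())
    /// ```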
1423 pub fn drop_many<I, S>(&self, names: I) -> Self
1424 where
1425 I: IntoIterator<Item = S>,
1426 S: Into<PlSmallStr>,
1427 {
1428 let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();
1429 self.drop_many_amortized(&names)
1430 }
1431
1432 /// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).
1433 pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {
1434 if names.is_empty() {
1435 return self.clone();
1436 }
1437 let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));
1438 self.columns.iter().for_each(|s| {
1439 if !names.contains(s.name()) {
1440 new_cols.push(s.clone())
1441 }
1442 });
1443
1444 unsafe { DataFrame::new_no_checks(self.height(), new_cols) }
1445 }
1446
1447 /// Insert a new column at a given index without checking for duplicates.
1448 /// This can leave the [`DataFrame`] in an invalid state.
1449 fn insert_column_no_name_check(
1450 &mut self,
1451 index: usize,
1452 column: Column,
1453 ) -> PolarsResult<&mut Self> {
1454 polars_ensure!(
1455 self.width() == 0 || column.len() == self.height(),
1456 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1457 column.len(), self.height(),
1458 );
1459
1460 if self.width() == 0 {
1461 self.height = column.len();
1462 }
1463
1464 self.columns.insert(index, column);
1465 self.clear_schema();
1466 Ok(self)
1467 }
1468
1469 /// Insert a new column at a given index.
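    ///
    /// # Example
    ///
    /// A minimal sketch:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("a" => [1, 2], "c" => [5, 6])?;
    /// df.insert_column(1, Column::new("b".into(), [3, 4]))?;
    /// assert_eq!(df.get_column_names(), &["a", "b", "c"]);
    /// # Ok::<(), PolarsError>(())
    /// ```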
1470 pub fn insert_column<S: IntoColumn>(
1471 &mut self,
1472 index: usize,
1473 column: S,
1474 ) -> PolarsResult<&mut Self> {
1475 let column = column.into_column();
1476 self.check_already_present(column.name().as_str())?;
1477 self.insert_column_no_name_check(index, column)
1478 }
1479
1480 fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {
1481 if let Some(idx) = self.get_column_index(column.name().as_str()) {
1482 self.replace_column(idx, column)?;
1483 } else {
1484 if self.width() == 0 {
1485 self.height = column.len();
1486 }
1487
1488 self.columns.push(column);
1489 self.clear_schema();
1490 }
1491 Ok(())
1492 }
1493
1494 /// Add a new column to this [`DataFrame`] or replace an existing one.
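    ///
    /// # Example
    ///
    /// A minimal sketch; a unit-length column is broadcast to the height of the frame:
    ///
    /// ```rust
    /// # use polars_core::prelude::*;
    /// let mut df = df!("x" => [1, 2, 3])?;
    /// df.with_column(Column::new("y".into(), [0.1, 0.2, 0.3]))?;
    /// df.with_column(Column::new("constant".into(), ["a"]))?;
    /// assert_eq!(df.shape(), (3, 3));
    /// # Ok::<(), PolarsError>(())
    /// ```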
1495 pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {
1496 fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {
1497 let height = df.height();
1498 if column.len() == 1 && height > 1 {
1499 column = column.new_from_index(0, height);
1500 }
1501
1502 if column.len() == height || df.get_columns().is_empty() {
1503 df.add_column_by_search(column)?;
1504 Ok(df)
1505 }
1506 // special case for literals
1507 else if height == 0 && column.len() == 1 {
1508 let s = column.clear();
1509 df.add_column_by_search(s)?;
1510 Ok(df)
1511 } else {
1512 polars_bail!(
1513 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1514 column.len(), height,
1515 );
1516 }
1517 }
1518 let column = column.into_column();
1519 inner(self, column)
1520 }
1521
1522 /// Adds a column to the [`DataFrame`] without doing any checks
1523 /// on length or duplicates.
1524 ///
1525 /// # Safety
1526 /// The caller must ensure `self.width() == 0 || column.len() == self.height()` .
1527 pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {
1528 debug_assert!(self.width() == 0 || self.height() == column.len());
1529 debug_assert!(self.get_column_index(column.name().as_str()).is_none());
1530
1531 // SAFETY: Invariant of function guarantees for case `width` > 0. We set the height
1532 // properly for `width` == 0.
1533 if self.width() == 0 {
1534 unsafe { self.set_height(column.len()) };
1535 }
1536 unsafe { self.get_columns_mut() }.push(column);
1537 self.clear_schema();
1538
1539 self
1540 }
1541
1542 // Note: the schema can be either the input or the output schema.
1543 fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {
1544 let name = c.name();
1545 if let Some((idx, _, _)) = schema.get_full(name.as_str()) {
1546 if self.columns.get(idx).map(|s| s.name()) != Some(name) {
1547 // Given schema is output_schema and we can push.
1548 if idx == self.columns.len() {
1549 if self.width() == 0 {
1550 self.height = c.len();
1551 }
1552
1553 self.columns.push(c);
1554 self.clear_schema();
1555 }
1556 // Schema is incorrect; fall back to search.
1557 else {
1558 debug_assert!(false);
1559 self.add_column_by_search(c)?;
1560 }
1561 } else {
1562 self.replace_column(idx, c)?;
1563 }
1564 } else {
1565 if self.width() == 0 {
1566 self.height = c.len();
1567 }
1568
1569 self.columns.push(c);
1570 self.clear_schema();
1571 }
1572
1573 Ok(())
1574 }
1575
1576 // Note: the schema can be either the input or the output schema.
1577 pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {
1578 for (i, s) in series.into_iter().enumerate() {
1579 // we need to branch here
1580 // because users can add multiple columns with the same name
1581 if i == 0 || schema.get(s.name().as_str()).is_some() {
1582 self.with_column_and_schema(s.into_column(), schema)?;
1583 } else {
1584 self.with_column(s.clone().into_column())?;
1585 }
1586 }
1587 Ok(())
1588 }
1589
1590 pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {
1591 for (i, s) in columns.into_iter().enumerate() {
1592 // we need to branch here
1593 // because users can add multiple columns with the same name
1594 if i == 0 || schema.get(s.name().as_str()).is_some() {
1595 self.with_column_and_schema(s, schema)?;
1596 } else {
1597 self.with_column(s.clone())?;
1598 }
1599 }
1600
1601 Ok(())
1602 }
1603
1604 /// Add a new column to this [`DataFrame`] or replace an existing one.
1605 /// Uses an existing schema to amortize lookups.
1606 /// If the schema is incorrect, we will fall back to a linear search.
1607 ///
1608 /// Note: the schema can be either the input or the output schema.
1609 pub fn with_column_and_schema<C: IntoColumn>(
1610 &mut self,
1611 column: C,
1612 schema: &Schema,
1613 ) -> PolarsResult<&mut Self> {
1614 let mut column = column.into_column();
1615
1616 let height = self.height();
1617 if column.len() == 1 && height > 1 {
1618 column = column.new_from_index(0, height);
1619 }
1620
1621 if column.len() == height || self.columns.is_empty() {
1622 self.add_column_by_schema(column, schema)?;
1623 Ok(self)
1624 }
1625 // special case for literals
1626 else if height == 0 && column.len() == 1 {
1627 let s = column.clear();
1628 self.add_column_by_schema(s, schema)?;
1629 Ok(self)
1630 } else {
1631 polars_bail!(
1632 ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",
1633 column.len(), height,
1634 );
1635 }
1636 }
1637
1638 /// Get a row in the [`DataFrame`]. Beware this is slow.
1639 ///
1640 /// # Example
1641 ///
1642 /// ```
1643 /// # use polars_core::prelude::*;
1644 /// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {
1645 /// df.get(idx)
1646 /// }
1647 /// ```
1648 pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {
1649 match self.columns.first() {
1650 Some(s) => {
1651 if s.len() <= idx {
1652 return None;
1653 }
1654 },
1655 None => return None,
1656 }
1657 // SAFETY: we just checked bounds
1658 unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }
1659 }
1660
1661 /// Select a [`Series`] by index.
1662 ///
1663 /// # Example
1664 ///
1665 /// ```rust
1666 /// # use polars_core::prelude::*;
1667 /// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],
1668 /// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;
1669 ///
1670 /// let s1: Option<&Column> = df.select_at_idx(0);
1671 /// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);
1672 ///
1673 /// assert_eq!(s1, Some(&s2));
1674 /// # Ok::<(), PolarsError>(())
1675 /// ```
1676 pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {
1677 self.columns.get(idx)
1678 }
1679
1680 /// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]
1681 ///
1682 /// # Examples
1683 ///
1684 /// ```rust
1685 /// # use polars_core::prelude::*;
1686 /// let df = df! {
1687 /// "0" => [0, 0, 0],
1688 /// "1" => [1, 1, 1],
1689 /// "2" => [2, 2, 2]
1690 /// }?;
1691 ///
1692 /// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));
1693 /// assert!(df.equals(&df.select_by_range(..)?));
1694 /// # Ok::<(), PolarsError>(())
1695 /// ```
1696 pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>
1697 where
1698 R: ops::RangeBounds<usize>,
1699 {
1700 // This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)
1701 // because it is a nightly-only feature. We should switch to it once it is stabilized.
1702 fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>
1703 where
1704 R: ops::RangeBounds<usize>,
1705 {
1706 let len = bounds.end;
1707
1708 let start: ops::Bound<&usize> = range.start_bound();
1709 let start = match start {
1710 ops::Bound::Included(&start) => start,
1711 ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {
1712 panic!("attempted to index slice from after maximum usize");
1713 }),
1714 ops::Bound::Unbounded => 0,
1715 };
1716
1717 let end: ops::Bound<&usize> = range.end_bound();
1718 let end = match end {
1719 ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {
1720 panic!("attempted to index slice up to maximum usize");
1721 }),
1722 ops::Bound::Excluded(&end) => end,
1723 ops::Bound::Unbounded => len,
1724 };
1725
1726 if start > end {
1727 panic!("slice index starts at {start} but ends at {end}");
1728 }
1729 if end > len {
1730 panic!("range end index {end} out of range for slice of length {len}",);
1731 }
1732
1733 ops::Range { start, end }
1734 }
1735
1736 let colnames = self.get_column_names_owned();
1737 let range = get_range(range, ..colnames.len());
1738
1739 self._select_impl(&colnames[range])
1740 }
1741
1742 /// Get column index of a [`Series`] by name.
1743 /// # Example
1744 ///
1745 /// ```rust
1746 /// # use polars_core::prelude::*;
1747 /// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],
1748 /// "Health" => [100, 200, 500],
1749 /// "Mana" => [250, 100, 0],
1750 /// "Strength" => [30, 150, 300])?;
1751 ///
1752 /// assert_eq!(df.get_column_index("Name"), Some(0));
1753 /// assert_eq!(df.get_column_index("Health"), Some(1));
1754 /// assert_eq!(df.get_column_index("Mana"), Some(2));
1755 /// assert_eq!(df.get_column_index("Strength"), Some(3));
1756 /// assert_eq!(df.get_column_index("Haste"), None);
1757 /// # Ok::<(), PolarsError>(())
1758 /// ```
1759 pub fn get_column_index(&self, name: &str) -> Option<usize> {
1760 let schema = self.schema();
1761 if let Some(idx) = schema.index_of(name) {
1762 if self
1763 .get_columns()
1764 .get(idx)
1765 .is_some_and(|c| c.name() == name)
1766 {
1767 return Some(idx);
1768 }
1769 }
1770
1771 self.columns.iter().position(|s| s.name().as_str() == name)
1772 }
1773
1774 /// Get column index of a [`Series`] by name.
1775 pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {
1776 self.get_column_index(name)
1777 .ok_or_else(|| polars_err!(col_not_found = name))
1778 }
1779
1780 /// Select a single column by name.
1781 ///
1782 /// # Example
1783 ///
1784 /// ```rust
1785 /// # use polars_core::prelude::*;
1786 /// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);
1787 /// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);
1788 /// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;
1789 ///
1790 /// assert_eq!(df.column("Password")?, &s1);
1791 /// # Ok::<(), PolarsError>(())
1792 /// ```
1793 pub fn column(&self, name: &str) -> PolarsResult<&Column> {
1794 let idx = self.try_get_column_index(name)?;
1795 Ok(self.select_at_idx(idx).unwrap())
1796 }
1797
1798     /// Select multiple columns by name.
1799 ///
1800 /// # Example
1801 ///
1802 /// ```rust
1803 /// # use polars_core::prelude::*;
1804 /// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],
1805 /// "Max weight (kg)" => [16.0, 35.89])?;
1806 /// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;
1807 ///
1808 /// assert_eq!(&df[0], sv[0]);
1809 /// assert_eq!(&df[1], sv[1]);
1810 /// # Ok::<(), PolarsError>(())
1811 /// ```
1812 pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>
1813 where
1814 I: IntoIterator<Item = S>,
1815 S: AsRef<str>,
1816 {
1817 names
1818 .into_iter()
1819 .map(|name| self.column(name.as_ref()))
1820 .collect()
1821 }
1822
1823 /// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].
1824 ///
1825 /// # Examples
1826 ///
1827 /// ```
1828 /// # use polars_core::prelude::*;
1829 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
1830 /// df.select(["foo", "bar"])
1831 /// }
1832 /// ```
1833 pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>
1834 where
1835 I: IntoIterator<Item = S>,
1836 S: Into<PlSmallStr>,
1837 {
1838 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1839 self._select_impl(cols.as_slice())
1840 }
1841
1842 pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1843 ensure_names_unique(cols, |s| s.as_str())?;
1844 self._select_impl_unchecked(cols)
1845 }
1846
1847 pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1848 let selected = self.select_columns_impl(cols)?;
1849 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1850 }
1851
1852 /// Select with a known schema. The schema names must match the column names of this DataFrame.
1853 pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>
1854 where
1855 I: IntoIterator<Item = S>,
1856 S: Into<PlSmallStr>,
1857 {
1858 let cols: UnitVec<PlSmallStr> = selection.into_iter().map(|s| s.into()).collect();
1859 self._select_with_schema_impl(&cols, schema, true)
1860 }
1861
1862 /// Select with a known schema without checking for duplicates in `selection`.
1863 /// The schema names must match the column names of this DataFrame.
1864 pub fn select_with_schema_unchecked<I, S>(
1865 &self,
1866 selection: I,
1867 schema: &Schema,
1868 ) -> PolarsResult<Self>
1869 where
1870 I: IntoIterator<Item = S>,
1871 S: Into<PlSmallStr>,
1872 {
1873 let cols: UnitVec<PlSmallStr> = selection.into_iter().map(|s| s.into()).collect();
1874 self._select_with_schema_impl(&cols, schema, false)
1875 }
1876
1877     /// The schema names must match the column names of this DataFrame.
1878 pub fn _select_with_schema_impl(
1879 &self,
1880 cols: &[PlSmallStr],
1881 schema: &Schema,
1882 check_duplicates: bool,
1883 ) -> PolarsResult<Self> {
1884 if check_duplicates {
1885 ensure_names_unique(cols, |s| s.as_str())?;
1886 }
1887
1888 let selected = self.select_columns_impl_with_schema(cols, schema)?;
1889 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1890 }
1891
1892     /// A non-generic implementation to reduce compiler bloat.
1893 fn select_columns_impl_with_schema(
1894 &self,
1895 cols: &[PlSmallStr],
1896 schema: &Schema,
1897 ) -> PolarsResult<Vec<Column>> {
1898 if cfg!(debug_assertions) {
1899 ensure_matching_schema_names(schema, self.schema())?;
1900 }
1901
1902 cols.iter()
1903 .map(|name| {
1904 let index = schema.try_get_full(name.as_str())?.0;
1905 Ok(self.columns[index].clone())
1906 })
1907 .collect()
1908 }
1909
1910 pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>
1911 where
1912 I: IntoIterator<Item = S>,
1913 S: Into<PlSmallStr>,
1914 {
1915 let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();
1916 self.select_physical_impl(&cols)
1917 }
1918
1919 fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {
1920 ensure_names_unique(cols, |s| s.as_str())?;
1921 let selected = self.select_columns_physical_impl(cols)?;
1922 Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })
1923 }
1924
1925 pub fn project(&self, to: SchemaRef) -> PolarsResult<Self> {
1926 let mut df = self.project_names(to.iter_names())?;
1927 df.cached_schema = to.into();
1928 Ok(df)
1929 }
1930
1931 pub fn project_names(
1932 &self,
1933 names: impl IntoIterator<Item = impl AsRef<str>>,
1934 ) -> PolarsResult<Self> {
1935 let from = self.schema();
1936 let columns = names
1937 .into_iter()
1938 .map(|name| Ok(self.columns[from.try_index_of(name.as_ref())?].clone()))
1939 .collect::<PolarsResult<Vec<_>>>()?;
1940 let df = unsafe { Self::new_no_checks(self.height(), columns) };
1941 Ok(df)
1942 }
1943
1944 /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].
1945 ///
1946 /// # Example
1947 ///
1948 /// ```rust
1949 /// # use polars_core::prelude::*;
1950 /// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],
1951 /// "Carbon" => [1, 2, 3],
1952 /// "Hydrogen" => [4, 6, 8])?;
1953 /// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;
1954 ///
1955 /// assert_eq!(df["Carbon"], sv[0]);
1956 /// assert_eq!(df["Hydrogen"], sv[1]);
1957 /// # Ok::<(), PolarsError>(())
1958 /// ```
1959 pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {
1960 let cols = selection.into_vec();
1961 self.select_columns_impl(&cols)
1962 }
1963
1964 fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {
1965 self.columns
1966 .iter()
1967 .enumerate()
1968 .map(|(i, s)| (s.name().as_str(), i))
1969 .collect()
1970 }
1971
1972     /// A non-generic implementation to reduce compiler bloat.
1973 fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1974 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1975 let name_to_idx = self._names_to_idx_map();
1976 cols.iter()
1977 .map(|name| {
1978 let idx = *name_to_idx
1979 .get(name.as_str())
1980 .ok_or_else(|| polars_err!(col_not_found = name))?;
1981 Ok(self.select_at_idx(idx).unwrap().to_physical_repr())
1982 })
1983 .collect::<PolarsResult<Vec<_>>>()?
1984 } else {
1985 cols.iter()
1986 .map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))
1987 .collect::<PolarsResult<Vec<_>>>()?
1988 };
1989
1990 Ok(selected)
1991 }
1992
1993     /// A non-generic implementation to reduce compiler bloat.
1994 fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {
1995 let selected = if cols.len() > 1 && self.columns.len() > 10 {
1996             // we hash because there are users that have millions of columns.
1997 // # https://github.com/pola-rs/polars/issues/1023
1998 let name_to_idx = self._names_to_idx_map();
1999
2000 cols.iter()
2001 .map(|name| {
2002 let idx = *name_to_idx
2003 .get(name.as_str())
2004 .ok_or_else(|| polars_err!(col_not_found = name))?;
2005 Ok(self.select_at_idx(idx).unwrap().clone())
2006 })
2007 .collect::<PolarsResult<Vec<_>>>()?
2008 } else {
2009 cols.iter()
2010 .map(|c| self.column(c.as_str()).cloned())
2011 .collect::<PolarsResult<Vec<_>>>()?
2012 };
2013
2014 Ok(selected)
2015 }
2016
2017 fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {
2018         // If there is a filtered column, just take its length as the new height.
2019 if let Some(fst) = filtered.first() {
2020 return fst.len();
2021 }
2022
2023         // Otherwise, count the number of true values in the mask to determine the height.
2024 let num_trues = mask.num_trues();
2025 if mask.len() == self.height() {
2026 num_trues
2027 } else {
2028 // This is for broadcasting masks
2029 debug_assert!(num_trues == 0 || num_trues == 1);
2030 self.height() * num_trues
2031 }
2032 }
2033
2034 /// Take the [`DataFrame`] rows by a boolean mask.
2035 ///
2036 /// # Example
2037 ///
2038 /// ```
2039 /// # use polars_core::prelude::*;
2040 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2041 /// let mask = df.column("sepal_width")?.is_not_null();
2042 /// df.filter(&mask)
2043 /// }
2044 /// ```
2045 pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2046 let new_col = self.try_apply_columns_par(&|s| s.filter(mask))?;
2047 let height = self.filter_height(&new_col, mask);
2048
2049 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2050 }
2051
2052 /// Same as `filter` but does not parallelize.
2053 pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
2054 let new_col = self.try_apply_columns(&|s| s.filter(mask))?;
2055 let height = self.filter_height(&new_col, mask);
2056
2057 Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
2058 }
2059
2060 /// Take [`DataFrame`] rows by index values.
2061 ///
2062 /// # Example
2063 ///
2064 /// ```
2065 /// # use polars_core::prelude::*;
2066 /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
2067 /// let idx = IdxCa::new("idx".into(), [0, 1, 9]);
2068 /// df.take(&idx)
2069 /// }
2070 /// ```
2071 pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
2072 let new_col = POOL.install(|| self.try_apply_columns_par(&|s| s.take(indices)))?;
2073
2074 Ok(unsafe { DataFrame::new_no_checks(indices.len(), new_col) })
2075 }
2076
2077 /// # Safety
2078 /// The indices must be in-bounds.
2079 pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
2080 self.take_unchecked_impl(idx, true)
2081 }
2082
2083 /// # Safety
2084 /// The indices must be in-bounds.
2085 pub unsafe fn gather_group_unchecked(&self, group: &GroupsIndicator) -> Self {
2086 match group {
2087 GroupsIndicator::Idx((_, indices)) => unsafe {
2088 self.take_slice_unchecked_impl(indices.as_slice(), false)
2089 },
2090 GroupsIndicator::Slice([offset, len]) => self.slice(*offset as i64, *len as usize),
2091 }
2092 }
2093
2094 /// # Safety
2095 /// The indices must be in-bounds.
2096 pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
2097 let cols = if allow_threads && POOL.current_num_threads() > 1 {
2098 POOL.install(|| {
2099 if POOL.current_num_threads() > self.width() {
2100 let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
2101 if self.len() / stride >= 2 {
2102 self._apply_columns_par(&|c| {
2103 // Nested types initiate a rechunk in their take_unchecked implementation.
2104 // If we do not rechunk, it will result in rechunk storms downstream.
2105 let c = if c.dtype().is_nested() {
2106 &c.rechunk()
2107 } else {
2108 c
2109 };
2110
2111 (0..idx.len().div_ceil(stride))
2112 .into_par_iter()
2113 .map(|i| c.take_unchecked(&idx.slice((i * stride) as i64, stride)))
2114 .reduce(
2115 || Column::new_empty(c.name().clone(), c.dtype()),
2116 |mut a, b| {
2117 a.append_owned(b).unwrap();
2118 a
2119 },
2120 )
2121 })
2122 } else {
2123 self._apply_columns_par(&|c| c.take_unchecked(idx))
2124 }
2125 } else {
2126 self._apply_columns_par(&|c| c.take_unchecked(idx))
2127 }
2128 })
2129 } else {
2130 self._apply_columns(&|s| s.take_unchecked(idx))
2131 };
2132 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2133 }
2134
2135 /// # Safety
2136 /// The indices must be in-bounds.
2137 pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
2138 self.take_slice_unchecked_impl(idx, true)
2139 }
2140
2141 /// # Safety
2142 /// The indices must be in-bounds.
2143 pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
2144 let cols = if allow_threads && POOL.current_num_threads() > 1 {
2145 POOL.install(|| {
2146 if POOL.current_num_threads() > self.width() {
2147 let stride = usize::max(idx.len().div_ceil(POOL.current_num_threads()), 256);
2148 if self.len() / stride >= 2 {
2149 self._apply_columns_par(&|c| {
2150 // Nested types initiate a rechunk in their take_unchecked implementation.
2151 // If we do not rechunk, it will result in rechunk storms downstream.
2152 let c = if c.dtype().is_nested() {
2153 &c.rechunk()
2154 } else {
2155 c
2156 };
2157
2158 (0..idx.len().div_ceil(stride))
2159 .into_par_iter()
2160 .map(|i| {
2161 let idx = &idx[i * stride..];
2162 let idx = &idx[..idx.len().min(stride)];
2163 c.take_slice_unchecked(idx)
2164 })
2165 .reduce(
2166 || Column::new_empty(c.name().clone(), c.dtype()),
2167 |mut a, b| {
2168 a.append_owned(b).unwrap();
2169 a
2170 },
2171 )
2172 })
2173 } else {
2174 self._apply_columns_par(&|s| s.take_slice_unchecked(idx))
2175 }
2176 } else {
2177 self._apply_columns_par(&|s| s.take_slice_unchecked(idx))
2178 }
2179 })
2180 } else {
2181 self._apply_columns(&|s| s.take_slice_unchecked(idx))
2182 };
2183 unsafe { DataFrame::new_no_checks(idx.len(), cols) }
2184 }
2185
2186 /// Rename a column in the [`DataFrame`].
2187 ///
2188 /// Should not be called in a loop as that can lead to quadratic behavior.
2189 ///
2190 /// # Example
2191 ///
2192 /// ```
2193 /// # use polars_core::prelude::*;
2194 /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
2195 /// let original_name = "foo";
2196 /// let new_name = "bar";
2197 /// df.rename(original_name, new_name.into())
2198 /// }
2199 /// ```
2200 pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
2201 if column == name.as_str() {
2202 return Ok(self);
2203 }
2204 polars_ensure!(
2205 !self.schema().contains(&name),
2206 Duplicate: "column rename attempted with already existing name \"{name}\""
2207 );
2208
2209 self.get_column_index(column)
2210 .and_then(|idx| self.columns.get_mut(idx))
2211 .ok_or_else(|| polars_err!(col_not_found = column))
2212 .map(|c| c.rename(name))?;
2213 self.clear_schema();
2214
2215 Ok(self)
2216 }
2217
2218 pub fn rename_many<'a>(
2219 &mut self,
2220 renames: impl Iterator<Item = (&'a str, PlSmallStr)>,
2221 ) -> PolarsResult<&mut Self> {
2222 let mut schema = self.schema().as_ref().clone();
2223 self.clear_schema();
2224
2225 for (from, to) in renames {
2226 if from == to.as_str() {
2227 continue;
2228 }
2229
2230 polars_ensure!(
2231 !schema.contains(&to),
2232 Duplicate: "column rename attempted with already existing name \"{to}\""
2233 );
2234
2235 match schema.get_full(from) {
2236 None => polars_bail!(col_not_found = from),
2237 Some((idx, _, _)) => {
2238 let (n, _) = schema.get_at_index_mut(idx).unwrap();
2239 *n = to.clone();
2240 self.columns.get_mut(idx).unwrap().rename(to);
2241 },
2242 }
2243 }
2244
2245 self.cached_schema = OnceLock::from(Arc::new(schema));
2246 Ok(self)
2247 }
2248
2249 /// Sort [`DataFrame`] in place.
2250 ///
2251 /// See [`DataFrame::sort`] for more instruction.
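    ///
    /// # Example
    ///
    /// A minimal sketch with a made-up column name:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let mut df = df!("a" => [3, 1, 2])?;
    /// df.sort_in_place(["a"], SortMultipleOptions::default())?;
    /// assert!(df.equals(&df!("a" => [1, 2, 3])?));
    /// # Ok::<(), PolarsError>(())
    /// ```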
2252 pub fn sort_in_place(
2253 &mut self,
2254 by: impl IntoVec<PlSmallStr>,
2255 sort_options: SortMultipleOptions,
2256 ) -> PolarsResult<&mut Self> {
2257 let by_column = self.select_columns(by)?;
2258 self.columns = self.sort_impl(by_column, sort_options, None)?.columns;
2259 Ok(self)
2260 }
2261
2262 #[doc(hidden)]
2263     /// This is the dispatch of Self::sort, and exists to reduce compile bloat from monomorphization.
2264 pub fn sort_impl(
2265 &self,
2266 by_column: Vec<Column>,
2267 sort_options: SortMultipleOptions,
2268 slice: Option<(i64, usize)>,
2269 ) -> PolarsResult<Self> {
2270 if by_column.is_empty() {
2271 // If no columns selected, any order (including original order) is correct.
2272 return if let Some((offset, len)) = slice {
2273 Ok(self.slice(offset, len))
2274 } else {
2275 Ok(self.clone())
2276 };
2277 }
2278
2279         // Note that the by_column argument may also contain evaluated expressions from
2280         // polars-lazy that are not even present in this dataframe. Therefore, when we try
2281         // to set the first column as sorted, we ignore the error, as those expressions are
2282         // not present (they are renamed to _POLARS_SORT_COLUMN_i).
2283 let first_descending = sort_options.descending[0];
2284 let first_by_column = by_column[0].name().to_string();
2285
2286 let set_sorted = |df: &mut DataFrame| {
2287 // Mark the first sort column as sorted; if the column does not exist it
2288 // is ok, because we sorted by an expression not present in the dataframe
2289 let _ = df.apply(&first_by_column, |s| {
2290 let mut s = s.clone();
2291 if first_descending {
2292 s.set_sorted_flag(IsSorted::Descending)
2293 } else {
2294 s.set_sorted_flag(IsSorted::Ascending)
2295 }
2296 s
2297 });
2298 };
2299 if self.is_empty() {
2300 let mut out = self.clone();
2301 set_sorted(&mut out);
2302 return Ok(out);
2303 }
2304
2305 if let Some((0, k)) = slice {
2306 if k < self.len() {
2307 return self.bottom_k_impl(k, by_column, sort_options);
2308 }
2309 }
2310         // Check if the required column is already sorted; if so we can exit early.
2311         // We only do this when there is a single column to sort by; for multiple
2312         // columns this check would be considerably more involved.
2313 #[cfg(feature = "dtype-categorical")]
2314 let is_not_categorical_enum =
2315 !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
2316 || matches!(by_column[0].dtype(), DataType::Enum(_, _)));
2317
2318 #[cfg(not(feature = "dtype-categorical"))]
2319 #[allow(non_upper_case_globals)]
2320 const is_not_categorical_enum: bool = true;
2321
2322 if by_column.len() == 1 && is_not_categorical_enum {
2323 let required_sorting = if sort_options.descending[0] {
2324 IsSorted::Descending
2325 } else {
2326 IsSorted::Ascending
2327 };
2328             // If the null count is 0 then nulls_last doesn't matter.
2329             // Safe to get the value at the last position since the dataframe is not empty (handled above).
2330 let no_sorting_required = (by_column[0].is_sorted_flag() == required_sorting)
2331 && ((by_column[0].null_count() == 0)
2332 || by_column[0].get(by_column[0].len() - 1).unwrap().is_null()
2333 == sort_options.nulls_last[0]);
2334
2335 if no_sorting_required {
2336 return if let Some((offset, len)) = slice {
2337 Ok(self.slice(offset, len))
2338 } else {
2339 Ok(self.clone())
2340 };
2341 }
2342 }
2343
2344 let has_nested = by_column.iter().any(|s| s.dtype().is_nested());
2345 let allow_threads = sort_options.multithreaded;
2346
2347 // a lot of indirection in both sorting and take
2348 let mut df = self.clone();
2349 let df = df.as_single_chunk_par();
2350 let mut take = match (by_column.len(), has_nested) {
2351 (1, false) => {
2352 let s = &by_column[0];
2353 let options = SortOptions {
2354 descending: sort_options.descending[0],
2355 nulls_last: sort_options.nulls_last[0],
2356 multithreaded: sort_options.multithreaded,
2357 maintain_order: sort_options.maintain_order,
2358 limit: sort_options.limit,
2359 };
2360 // fast path for a frame with a single series
2361 // no need to compute the sort indices and then take by these indices
2362 // simply sort and return as frame
2363 if df.width() == 1 && df.check_name_to_idx(s.name().as_str()).is_ok() {
2364 let mut out = s.sort_with(options)?;
2365 if let Some((offset, len)) = slice {
2366 out = out.slice(offset, len);
2367 }
2368 return Ok(out.into_frame());
2369 }
2370 s.arg_sort(options)
2371 },
2372 _ => arg_sort(&by_column, sort_options)?,
2373 };
2374
2375 if let Some((offset, len)) = slice {
2376 take = take.slice(offset, len);
2377 }
2378
2379 // SAFETY:
2380 // the created indices are in bounds
2381 let mut df = unsafe { df.take_unchecked_impl(&take, allow_threads) };
2382 set_sorted(&mut df);
2383 Ok(df)
2384 }
2385
2386 /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
2387 ///
2388 /// This dataframe does not necessarily have a specified schema and may be changed at any
2389 /// point. It is primarily used for debugging.
2390 pub fn _to_metadata(&self) -> DataFrame {
2391 let num_columns = self.columns.len();
2392
2393 let mut column_names =
2394 StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);
2395 let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);
2396 let mut sorted_asc_ca =
2397 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);
2398 let mut sorted_dsc_ca =
2399 BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);
2400 let mut fast_explode_list_ca =
2401 BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);
2402 let mut materialized_at_ca =
2403 StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);
2404
2405 for col in &self.columns {
2406 let flags = col.get_flags();
2407
2408 let (repr, materialized_at) = match col {
2409 Column::Series(s) => ("series", s.materialized_at()),
2410 Column::Scalar(_) => ("scalar", None),
2411 };
2412 let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);
2413 let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);
2414 let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);
2415
2416 column_names.append_value(col.name().clone());
2417 repr_ca.append_value(repr);
2418 sorted_asc_ca.append_value(sorted_asc);
2419 sorted_dsc_ca.append_value(sorted_dsc);
2420 fast_explode_list_ca.append_value(fast_explode_list);
2421 materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));
2422 }
2423
2424 unsafe {
2425 DataFrame::new_no_checks(
2426 self.width(),
2427 vec![
2428 column_names.finish().into_column(),
2429 repr_ca.finish().into_column(),
2430 sorted_asc_ca.finish().into_column(),
2431 sorted_dsc_ca.finish().into_column(),
2432 fast_explode_list_ca.finish().into_column(),
2433 materialized_at_ca.finish().into_column(),
2434 ],
2435 )
2436 }
2437 }
2438
2439 /// Return a sorted clone of this [`DataFrame`].
2440 ///
2441     /// In many cases the output chunks will be contiguous in memory, but this is not guaranteed.
2442 /// # Example
2443 ///
2444 /// Sort by a single column with default options:
2445 /// ```
2446 /// # use polars_core::prelude::*;
2447 /// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {
2448 /// df.sort(["sepal_width"], Default::default())
2449 /// }
2450 /// ```
2451 /// Sort by a single column with specific order:
2452 /// ```
2453 /// # use polars_core::prelude::*;
2454 /// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {
2455 /// df.sort(
2456 /// ["sepal_width"],
2457 /// SortMultipleOptions::new()
2458 /// .with_order_descending(descending)
2459 /// )
2460 /// }
2461 /// ```
2462 /// Sort by multiple columns with specifying order for each column:
2463 /// ```
2464 /// # use polars_core::prelude::*;
2465 /// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {
2466 /// df.sort(
2467 /// ["sepal_width", "sepal_length"],
2468 /// SortMultipleOptions::new()
2469 /// .with_order_descending_multi([false, true])
2470 /// )
2471 /// }
2472 /// ```
2473 /// See [`SortMultipleOptions`] for more options.
2474 ///
2475 /// Also see [`DataFrame::sort_in_place`].
2476 pub fn sort(
2477 &self,
2478 by: impl IntoVec<PlSmallStr>,
2479 sort_options: SortMultipleOptions,
2480 ) -> PolarsResult<Self> {
2481 let mut df = self.clone();
2482 df.sort_in_place(by, sort_options)?;
2483 Ok(df)
2484 }
2485
2486 /// Replace a column with a [`Series`].
2487 ///
2488 /// # Example
2489 ///
2490 /// ```rust
2491 /// # use polars_core::prelude::*;
2492 /// let mut df: DataFrame = df!("Country" => ["United States", "China"],
2493 /// "Area (km²)" => [9_833_520, 9_596_961])?;
2494 /// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);
2495 ///
2496 /// assert!(df.replace("Nation", s.clone()).is_err());
2497 /// assert!(df.replace("Country", s).is_ok());
2498 /// # Ok::<(), PolarsError>(())
2499 /// ```
2500 pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {
2501 self.apply(column, |_| new_col.into_series())
2502 }
2503
2504 /// Replace or update a column. The difference between this method and [DataFrame::with_column]
2505     /// is that now the value of `column` determines the name of the column and not the name
2506 /// of the `Series` passed to this method.
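    ///
    /// # Example
    ///
    /// A minimal sketch with made-up names:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let mut df = df!("values" => [1, 2, 3])?;
    /// // The data is stored under the given column name, not under the name of the Series.
    /// df.replace_or_add("values".into(), Series::new("ignored".into(), [10, 20, 30]))?;
    /// assert!(df.equals(&df!("values" => [10, 20, 30])?));
    /// # Ok::<(), PolarsError>(())
    /// ```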
2507 pub fn replace_or_add<S: IntoSeries>(
2508 &mut self,
2509 column: PlSmallStr,
2510 new_col: S,
2511 ) -> PolarsResult<&mut Self> {
2512 let mut new_col = new_col.into_series();
2513 new_col.rename(column);
2514 self.with_column(new_col)
2515 }
2516
2517 /// Replace column at index `idx` with a [`Series`].
2518 ///
2519 /// # Example
2520 ///
2521     /// ```ignore
2522 /// # use polars_core::prelude::*;
2523 /// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);
2524 /// let s1 = Series::new("ascii".into(), [70, 79, 79]);
2525 /// let mut df = DataFrame::new(vec![s0, s1])?;
2526 ///
2527 /// // Add 32 to get lowercase ascii values
2528 /// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);
2529 /// # Ok::<(), PolarsError>(())
2530 /// ```
2531 pub fn replace_column<C: IntoColumn>(
2532 &mut self,
2533 index: usize,
2534 new_column: C,
2535 ) -> PolarsResult<&mut Self> {
2536 polars_ensure!(
2537 index < self.width(),
2538 ShapeMismatch:
2539 "unable to replace at index {}, the DataFrame has only {} columns",
2540 index, self.width(),
2541 );
2542 let mut new_column = new_column.into_column();
2543 polars_ensure!(
2544 new_column.len() == self.height(),
2545 ShapeMismatch:
2546 "unable to replace a column, series length {} doesn't match the DataFrame height {}",
2547 new_column.len(), self.height(),
2548 );
2549 let old_col = &mut self.columns[index];
2550 mem::swap(old_col, &mut new_column);
2551 self.clear_schema();
2552 Ok(self)
2553 }
2554
2555 /// Apply a closure to a column. This is the recommended way to do in place modification.
2556 ///
2557 /// # Example
2558 ///
2559 /// ```rust
2560 /// # use polars_core::prelude::*;
2561 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2562 /// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);
2563 /// let mut df = DataFrame::new(vec![s0, s1])?;
2564 ///
2565 /// fn str_to_len(str_val: &Column) -> Column {
2566 /// str_val.str()
2567 /// .unwrap()
2568 /// .into_iter()
2569 /// .map(|opt_name: Option<&str>| {
2570 /// opt_name.map(|name: &str| name.len() as u32)
2571 /// })
2572 /// .collect::<UInt32Chunked>()
2573 /// .into_column()
2574 /// }
2575 ///
2576 /// // Replace the names column by the length of the names.
2577 /// df.apply("names", str_to_len);
2578 /// # Ok::<(), PolarsError>(())
2579 /// ```
2580 /// Results in:
2581 ///
2582 /// ```text
2583 /// +--------+-------+
2584     /// | foo    | names |
2585     /// | ---    | ---   |
2586 /// | str | u32 |
2587 /// +========+=======+
2588 /// | "ham" | 4 |
2589 /// +--------+-------+
2590 /// | "spam" | 6 |
2591 /// +--------+-------+
2592 /// | "egg" | 3 |
2593 /// +--------+-------+
2594 /// ```
2595 pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>
2596 where
2597 F: FnOnce(&Column) -> C,
2598 C: IntoColumn,
2599 {
2600 let idx = self.check_name_to_idx(name)?;
2601 self.apply_at_idx(idx, f)?;
2602 Ok(self)
2603 }
2604
2605 /// Apply a closure to a column at index `idx`. This is the recommended way to do in place
2606 /// modification.
2607 ///
2608 /// # Example
2609 ///
2610 /// ```rust
2611 /// # use polars_core::prelude::*;
2612 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);
2613 /// let s1 = Column::new("ascii".into(), [70, 79, 79]);
2614 /// let mut df = DataFrame::new(vec![s0, s1])?;
2615 ///
2616 /// // Add 32 to get lowercase ascii values
2617 /// df.apply_at_idx(1, |s| s + 32);
2618 /// # Ok::<(), PolarsError>(())
2619 /// ```
2620 /// Results in:
2621 ///
2622 /// ```text
2623 /// +--------+-------+
2624 /// | foo | ascii |
2625 /// | --- | --- |
2626 /// | str | i32 |
2627 /// +========+=======+
2628 /// | "ham" | 102 |
2629 /// +--------+-------+
2630 /// | "spam" | 111 |
2631 /// +--------+-------+
2632 /// | "egg" | 111 |
2633 /// +--------+-------+
2634 /// ```
2635 pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2636 where
2637 F: FnOnce(&Column) -> C,
2638 C: IntoColumn,
2639 {
2640 let df_height = self.height();
2641 let width = self.width();
2642 let col = self.columns.get_mut(idx).ok_or_else(|| {
2643 polars_err!(
2644 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2645 idx, width
2646 )
2647 })?;
2648 let name = col.name().clone();
2649 let dtype_before = col.dtype().clone();
2650 let new_col = f(col).into_column();
2651 match new_col.len() {
2652 1 => {
2653 let new_col = new_col.new_from_index(0, df_height);
2654 let _ = mem::replace(col, new_col);
2655 },
2656 len if (len == df_height) => {
2657 let _ = mem::replace(col, new_col);
2658 },
2659 len => polars_bail!(
2660 ShapeMismatch:
2661 "resulting Series has length {} while the DataFrame has height {}",
2662 len, df_height
2663 ),
2664 }
2665
2666 // make sure the name remains the same after applying the closure
2667 unsafe {
2668 let col = self.columns.get_unchecked_mut(idx);
2669 col.rename(name);
2670
2671 if col.dtype() != &dtype_before {
2672 self.clear_schema();
2673 }
2674 }
2675 Ok(self)
2676 }
2677
2678 /// Apply a closure that may fail to a column at index `idx`. This is the recommended way to do in place
2679 /// modification.
2680 ///
2681 /// # Example
2682 ///
2683     /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a range of indexes.
2684 ///
2685 /// ```rust
2686 /// # use polars_core::prelude::*;
2687 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2688 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2689 /// let mut df = DataFrame::new(vec![s0, s1])?;
2690 ///
2691 /// let idx = vec![0, 1, 4];
2692 ///
2693 /// df.try_apply("foo", |c| {
2694 /// c.str()?
2695 /// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))
2696 /// });
2697 /// # Ok::<(), PolarsError>(())
2698 /// ```
2699 /// Results in:
2700 ///
2701 /// ```text
2702 /// +---------------------+--------+
2703 /// | foo | values |
2704 /// | --- | --- |
2705 /// | str | i32 |
2706 /// +=====================+========+
2707 /// | "ham-is-modified" | 1 |
2708 /// +---------------------+--------+
2709 /// | "spam-is-modified" | 2 |
2710 /// +---------------------+--------+
2711 /// | "egg" | 3 |
2712 /// +---------------------+--------+
2713 /// | "bacon" | 4 |
2714 /// +---------------------+--------+
2715 /// | "quack-is-modified" | 5 |
2716 /// +---------------------+--------+
2717 /// ```
2718 pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>
2719 where
2720 F: FnOnce(&Column) -> PolarsResult<C>,
2721 C: IntoColumn,
2722 {
2723 let width = self.width();
2724 let col = self.columns.get_mut(idx).ok_or_else(|| {
2725 polars_err!(
2726 ComputeError: "invalid column index: {} for a DataFrame with {} columns",
2727 idx, width
2728 )
2729 })?;
2730 let name = col.name().clone();
2731
2732 let _ = mem::replace(col, f(col).map(|c| c.into_column())?);
2733
2734 // make sure the name remains the same after applying the closure
2735 unsafe {
2736 let col = self.columns.get_unchecked_mut(idx);
2737 col.rename(name);
2738 }
2739 Ok(self)
2740 }
2741
2742 /// Apply a closure that may fail to a column. This is the recommended way to do in place
2743 /// modification.
2744 ///
2745 /// # Example
2746 ///
2747     /// This is the idiomatic way to replace some values in a column of a `DataFrame` given a boolean mask.
2748 ///
2749 /// ```rust
2750 /// # use polars_core::prelude::*;
2751 /// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);
2752 /// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);
2753 /// let mut df = DataFrame::new(vec![s0, s1])?;
2754 ///
2755 /// // create a mask
2756 /// let values = df.column("values")?.as_materialized_series();
2757 /// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;
2758 ///
2759 /// df.try_apply("foo", |c| {
2760 /// c.str()?
2761 /// .set(&mask, Some("not_within_bounds"))
2762 /// });
2763 /// # Ok::<(), PolarsError>(())
2764 /// ```
2765 /// Results in:
2766 ///
2767 /// ```text
2768 /// +---------------------+--------+
2769 /// | foo | values |
2770 /// | --- | --- |
2771 /// | str | i32 |
2772 /// +=====================+========+
2773 /// | "not_within_bounds" | 1 |
2774 /// +---------------------+--------+
2775 /// | "spam" | 2 |
2776 /// +---------------------+--------+
2777 /// | "egg" | 3 |
2778 /// +---------------------+--------+
2779 /// | "bacon" | 4 |
2780 /// +---------------------+--------+
2781 /// | "not_within_bounds" | 5 |
2782 /// +---------------------+--------+
2783 /// ```
2784 pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>
2785 where
2786 F: FnOnce(&Series) -> PolarsResult<C>,
2787 C: IntoColumn,
2788 {
2789 let idx = self.try_get_column_index(column)?;
2790 self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))
2791 }
2792
2793 /// Slice the [`DataFrame`] along the rows.
2794 ///
2795 /// # Example
2796 ///
2797 /// ```rust
2798 /// # use polars_core::prelude::*;
2799 /// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],
2800 /// "Color" => ["Green", "Red", "White", "White", "Red"])?;
2801 /// let sl: DataFrame = df.slice(2, 3);
2802 ///
2803 /// assert_eq!(sl.shape(), (3, 2));
2804 /// println!("{}", sl);
2805 /// # Ok::<(), PolarsError>(())
2806 /// ```
2807 /// Output:
2808 /// ```text
2809 /// shape: (3, 2)
2810 /// +-------+-------+
2811 /// | Fruit | Color |
2812 /// | --- | --- |
2813 /// | str | str |
2814 /// +=======+=======+
2815 /// | Grape | White |
2816 /// +-------+-------+
2817 /// | Fig | White |
2818 /// +-------+-------+
2819 /// | Fig | Red |
2820 /// +-------+-------+
2821 /// ```
2822 #[must_use]
2823 pub fn slice(&self, offset: i64, length: usize) -> Self {
2824 if offset == 0 && length == self.height() {
2825 return self.clone();
2826 }
2827 if length == 0 {
2828 return self.clear();
2829 }
2830 let col = self
2831 .columns
2832 .iter()
2833 .map(|s| s.slice(offset, length))
2834 .collect::<Vec<_>>();
2835
2836 let height = if let Some(fst) = col.first() {
2837 fst.len()
2838 } else {
2839 let (_, length) = slice_offsets(offset, length, self.height());
2840 length
2841 };
2842
2843 unsafe { DataFrame::new_no_checks(height, col) }
2844 }
2845
2846 /// Split [`DataFrame`] at the given `offset`.
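    ///
    /// # Example
    ///
    /// A minimal sketch with a made-up column name:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1, 2, 3, 4])?;
    /// let (left, right) = df.split_at(1);
    /// assert_eq!(left.height(), 1);
    /// assert_eq!(right.height(), 3);
    /// # Ok::<(), PolarsError>(())
    /// ```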
2847 pub fn split_at(&self, offset: i64) -> (Self, Self) {
2848 let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2849
2850 let (idx, _) = slice_offsets(offset, 0, self.height());
2851
2852 let a = unsafe { DataFrame::new_no_checks(idx, a) };
2853 let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };
2854 (a, b)
2855 }
2856
2857 #[must_use]
2858 pub fn clear(&self) -> Self {
2859 let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
2860 unsafe { DataFrame::new_no_checks(0, col) }
2861 }
2862
2863 #[must_use]
2864 pub fn slice_par(&self, offset: i64, length: usize) -> Self {
2865 if offset == 0 && length == self.height() {
2866 return self.clone();
2867 }
2868 let columns = self._apply_columns_par(&|s| s.slice(offset, length));
2869 unsafe { DataFrame::new_no_checks(length, columns) }
2870 }
2871
2872 #[must_use]
2873 pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {
2874 if offset == 0 && length == self.height() {
2875 return self.clone();
2876 }
2877 // @scalar-opt
2878 let columns = self._apply_columns(&|s| {
2879 let mut out = s.slice(offset, length);
2880 out.shrink_to_fit();
2881 out
2882 });
2883 unsafe { DataFrame::new_no_checks(length, columns) }
2884 }
2885
2886 /// Get the head of the [`DataFrame`].
2887 ///
2888 /// # Example
2889 ///
2890 /// ```rust
2891 /// # use polars_core::prelude::*;
2892 /// let countries: DataFrame =
2893 /// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],
2894 /// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],
2895 /// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],
2896 /// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;
2897 /// assert_eq!(countries.shape(), (5, 4));
2898 ///
2899 /// println!("{}", countries.head(Some(3)));
2900 /// # Ok::<(), PolarsError>(())
2901 /// ```
2902 ///
2903 /// Output:
2904 ///
2905 /// ```text
2906 /// shape: (3, 4)
2907 /// +--------------------+---------------+---------------+------------+
2908 /// | Rank by GDP (2021) | Continent | Country | Capital |
2909 /// | --- | --- | --- | --- |
2910 /// | i32 | str | str | str |
2911 /// +====================+===============+===============+============+
2912 /// | 1 | North America | United States | Washington |
2913 /// +--------------------+---------------+---------------+------------+
2914 /// | 2 | Asia | China | Beijing |
2915 /// +--------------------+---------------+---------------+------------+
2916 /// | 3 | Asia | Japan | Tokyo |
2917 /// +--------------------+---------------+---------------+------------+
2918 /// ```
2919 #[must_use]
2920 pub fn head(&self, length: Option<usize>) -> Self {
2921 let col = self
2922 .columns
2923 .iter()
2924 .map(|c| c.head(length))
2925 .collect::<Vec<_>>();
2926
2927 let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);
2928 let height = usize::min(height, self.height());
2929 unsafe { DataFrame::new_no_checks(height, col) }
2930 }
2931
2932 /// Get the tail of the [`DataFrame`].
2933 ///
2934 /// # Example
2935 ///
2936 /// ```rust
2937 /// # use polars_core::prelude::*;
2938 /// let countries: DataFrame =
2939 /// df!("Rank (2021)" => [105, 106, 107, 108, 109],
2940     ///         "Apple Price (€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],
2941 /// "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;
2942 /// assert_eq!(countries.shape(), (5, 3));
2943 ///
2944 /// println!("{}", countries.tail(Some(2)));
2945 /// # Ok::<(), PolarsError>(())
2946 /// ```
2947 ///
2948 /// Output:
2949 ///
2950 /// ```text
2951 /// shape: (2, 3)
2952 /// +-------------+--------------------+---------+
2953     /// | Rank (2021) | Apple Price (€/kg) | Country |
2954 /// | --- | --- | --- |
2955 /// | i32 | f64 | str |
2956 /// +=============+====================+=========+
2957 /// | 108 | 0.65 | Syria |
2958 /// +-------------+--------------------+---------+
2959 /// | 109 | 0.52 | Turkey |
2960 /// +-------------+--------------------+---------+
2961 /// ```
2962 #[must_use]
2963 pub fn tail(&self, length: Option<usize>) -> Self {
2964 let col = self
2965 .columns
2966 .iter()
2967 .map(|c| c.tail(length))
2968 .collect::<Vec<_>>();
2969
2970 let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);
2971 let height = usize::min(height, self.height());
2972 unsafe { DataFrame::new_no_checks(height, col) }
2973 }
2974
2975 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.
2976 ///
2977 /// # Panics
2978 ///
2979 /// Panics if the [`DataFrame`] that is passed is not rechunked.
2980 ///
2981 /// This responsibility is left to the caller as we don't want to take mutable references here,
2982 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
2983 /// as well.
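    ///
    /// # Example
    ///
    /// A minimal sketch; a freshly constructed [`DataFrame`] has a single chunk per column,
    /// so no rechunk is needed here:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1, 2, 3])?;
    /// let n_batches = df.iter_chunks(CompatLevel::newest(), false).count();
    /// assert_eq!(n_batches, 1);
    /// # Ok::<(), PolarsError>(())
    /// ```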
2984 pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter<'_> {
2985 debug_assert!(!self.should_rechunk(), "expected equal chunks");
2986         // If we must convert to an older `compat_level` and any of the columns is a binview
2987         // (String/Binary), we allow parallelism, as we must allocate new arrow strings/binaries.
2988 let must_convert = compat_level.0 == 0;
2989 let parallel = parallel
2990 && must_convert
2991 && self.columns.len() > 1
2992 && self
2993 .columns
2994 .iter()
2995 .any(|s| matches!(s.dtype(), DataType::String | DataType::Binary));
2996
2997 RecordBatchIter {
2998 columns: &self.columns,
2999 schema: Arc::new(
3000 self.columns
3001 .iter()
3002 .map(|c| c.field().to_arrow(compat_level))
3003 .collect(),
3004 ),
3005 idx: 0,
3006 n_chunks: self.first_col_n_chunks(),
3007 compat_level,
3008 parallel,
3009 }
3010 }
3011
3012 /// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.
3013 ///
3014 /// # Panics
3015 ///
3016 /// Panics if the [`DataFrame`] that is passed is not rechunked.
3017 ///
3018 /// This responsibility is left to the caller as we don't want to take mutable references here,
3019 /// but we also don't want to rechunk here, as this operation is costly and would benefit the caller
3020 /// as well.
3021 pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {
3022 debug_assert!(!self.should_rechunk());
3023 PhysRecordBatchIter {
3024 schema: Arc::new(
3025 self.get_columns()
3026 .iter()
3027 .map(|c| c.field().to_arrow(CompatLevel::newest()))
3028 .collect(),
3029 ),
3030 arr_iters: self
3031 .materialized_column_iter()
3032 .map(|s| s.chunks().iter())
3033 .collect(),
3034 }
3035 }
3036
3037     /// Get a [`DataFrame`] with all the rows in reversed order.
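    ///
    /// # Example
    ///
    /// A minimal sketch with a made-up column name:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1, 2, 3])?;
    /// assert!(df.reverse().equals(&df!("a" => [3, 2, 1])?));
    /// # Ok::<(), PolarsError>(())
    /// ```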
3038 #[must_use]
3039 pub fn reverse(&self) -> Self {
3040 let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();
3041 unsafe { DataFrame::new_no_checks(self.height(), col) }
3042 }
3043
3044 /// Shift the values by a given period and fill the parts that will be empty due to this operation
3045 /// with `Nones`.
3046 ///
3047 /// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.
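    ///
    /// # Example
    ///
    /// A minimal sketch with a made-up column name:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1, 2, 3])?;
    /// let shifted = df.shift(1);
    /// assert_eq!(shifted.height(), 3);
    /// assert!(shifted.column("a")?.get(0)?.is_null());
    /// # Ok::<(), PolarsError>(())
    /// ```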
3048 #[must_use]
3049 pub fn shift(&self, periods: i64) -> Self {
3050 let col = self._apply_columns_par(&|s| s.shift(periods));
3051 unsafe { DataFrame::new_no_checks(self.height(), col) }
3052 }
3053
3054 /// Replace None values with one of the following strategies:
3055 /// * Forward fill (replace None with the previous value)
3056 /// * Backward fill (replace None with the next value)
3057 /// * Mean fill (replace None with the mean of the whole array)
3058 /// * Min fill (replace None with the minimum of the whole array)
3059 /// * Max fill (replace None with the maximum of the whole array)
3060 ///
3061 /// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.
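    ///
    /// # Example
    ///
    /// A minimal sketch with a made-up column name, assuming the `Mean` strategy variant:
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [Some(1.0), None, Some(3.0)])?;
    /// let filled = df.fill_null(FillNullStrategy::Mean)?;
    /// assert_eq!(filled.column("a")?.null_count(), 0);
    /// # Ok::<(), PolarsError>(())
    /// ```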
3062 pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {
3063 let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;
3064
3065 Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })
3066 }
3067
3068     /// Pipe different functions/closures that work on a DataFrame together.
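    ///
    /// # Example
    ///
    /// A minimal sketch with a made-up column name:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [1, 2, 3])?;
    /// let reversed = df.pipe(|df| Ok(df.reverse()))?;
    /// assert_eq!(reversed.height(), 3);
    /// # Ok::<(), PolarsError>(())
    /// ```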
3069 pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>
3070 where
3071 F: Fn(DataFrame) -> PolarsResult<B>,
3072 {
3073 f(self)
3074 }
3075
3076     /// Pipe different functions/closures that work on a DataFrame together.
3077 pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>
3078 where
3079 F: Fn(&mut DataFrame) -> PolarsResult<B>,
3080 {
3081 f(self)
3082 }
3083
3084     /// Pipe different functions/closures that work on a DataFrame together.
3085 pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>
3086 where
3087 F: Fn(DataFrame, Args) -> PolarsResult<B>,
3088 {
3089 f(self, args)
3090 }
3091
3092 /// Drop duplicate rows from a [`DataFrame`].
3093     /// *This fails when there is a column of type List in the DataFrame.*
3094 ///
3095 /// Stable means that the order is maintained. This has a higher cost than an unstable distinct.
3096 ///
3097 /// # Example
3098 ///
3099 /// ```no_run
3100 /// # use polars_core::prelude::*;
3101 /// let df = df! {
3102 /// "flt" => [1., 1., 2., 2., 3., 3.],
3103 /// "int" => [1, 1, 2, 2, 3, 3, ],
3104 /// "str" => ["a", "a", "b", "b", "c", "c"]
3105 /// }?;
3106 ///
3107 /// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);
3108 /// # Ok::<(), PolarsError>(())
3109 /// ```
3110 /// Returns
3111 ///
3112 /// ```text
3113 /// +-----+-----+-----+
3114 /// | flt | int | str |
3115 /// | --- | --- | --- |
3116 /// | f64 | i32 | str |
3117 /// +=====+=====+=====+
3118 /// | 1 | 1 | "a" |
3119 /// +-----+-----+-----+
3120 /// | 2 | 2 | "b" |
3121 /// +-----+-----+-----+
3122 /// | 3 | 3 | "c" |
3123 /// +-----+-----+-----+
3124 /// ```
3125 #[cfg(feature = "algorithm_group_by")]
3126 pub fn unique_stable(
3127 &self,
3128 subset: Option<&[String]>,
3129 keep: UniqueKeepStrategy,
3130 slice: Option<(i64, usize)>,
3131 ) -> PolarsResult<DataFrame> {
3132 self.unique_impl(
3133 true,
3134 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3135 keep,
3136 slice,
3137 )
3138 }
3139
3140 /// Unstable distinct. See [`DataFrame::unique_stable`].
3141 #[cfg(feature = "algorithm_group_by")]
3142 pub fn unique<I, S>(
3143 &self,
3144 subset: Option<&[String]>,
3145 keep: UniqueKeepStrategy,
3146 slice: Option<(i64, usize)>,
3147 ) -> PolarsResult<DataFrame> {
3148 self.unique_impl(
3149 false,
3150 subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),
3151 keep,
3152 slice,
3153 )
3154 }
3155
3156 #[cfg(feature = "algorithm_group_by")]
3157 pub fn unique_impl(
3158 &self,
3159 maintain_order: bool,
3160 subset: Option<Vec<PlSmallStr>>,
3161 keep: UniqueKeepStrategy,
3162 slice: Option<(i64, usize)>,
3163 ) -> PolarsResult<Self> {
3164 let names = subset.unwrap_or_else(|| self.get_column_names_owned());
3165 let mut df = self.clone();
3166 // take on multiple chunks is terrible
3167 df.as_single_chunk_par();
3168
3169 let columns = match (keep, maintain_order) {
3170 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {
3171 let gb = df.group_by_stable(names)?;
3172 let groups = gb.get_groups();
3173 let (offset, len) = slice.unwrap_or((0, groups.len()));
3174 let groups = groups.slice(offset, len);
3175 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3176 },
3177 (UniqueKeepStrategy::Last, true) => {
3178                 // We need to maintain order based on the last occurrence, so the stable groups
3179                 // (which are ordered by the first occurrence) cannot be used as-is.
3180 let gb = df.group_by_stable(names)?;
3181 let groups = gb.get_groups();
3182
3183 let last_idx: NoNull<IdxCa> = groups
3184 .iter()
3185 .map(|g| match g {
3186 GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],
3187 GroupsIndicator::Slice([first, len]) => first + len - 1,
3188 })
3189 .collect();
3190
3191 let mut last_idx = last_idx.into_inner().sort(false);
3192
3193 if let Some((offset, len)) = slice {
3194 last_idx = last_idx.slice(offset, len);
3195 }
3196
3197 let last_idx = NoNull::new(last_idx);
3198 let out = unsafe { df.take_unchecked(&last_idx) };
3199 return Ok(out);
3200 },
3201 (UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {
3202 let gb = df.group_by(names)?;
3203 let groups = gb.get_groups();
3204 let (offset, len) = slice.unwrap_or((0, groups.len()));
3205 let groups = groups.slice(offset, len);
3206 df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })
3207 },
3208 (UniqueKeepStrategy::Last, false) => {
3209 let gb = df.group_by(names)?;
3210 let groups = gb.get_groups();
3211 let (offset, len) = slice.unwrap_or((0, groups.len()));
3212 let groups = groups.slice(offset, len);
3213 df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })
3214 },
3215 (UniqueKeepStrategy::None, _) => {
3216 let df_part = df.select(names)?;
3217 let mask = df_part.is_unique()?;
3218 let mut filtered = df.filter(&mask)?;
3219
3220 if let Some((offset, len)) = slice {
3221 filtered = filtered.slice(offset, len);
3222 }
3223 return Ok(filtered);
3224 },
3225 };
3226 let height = Self::infer_height(&columns);
3227 Ok(unsafe { DataFrame::new_no_checks(height, columns) })
3228 }
3229
3230 /// Get a mask of all the unique rows in the [`DataFrame`].
3231 ///
3232 /// # Example
3233 ///
3234 /// ```no_run
3235 /// # use polars_core::prelude::*;
3236 /// let df: DataFrame = df!("Company" => ["Apple", "Microsoft"],
3237 /// "ISIN" => ["US0378331005", "US5949181045"])?;
3238 /// let ca: ChunkedArray<BooleanType> = df.is_unique()?;
3239 ///
3240 /// assert!(ca.all());
3241 /// # Ok::<(), PolarsError>(())
3242 /// ```
3243 #[cfg(feature = "algorithm_group_by")]
3244 pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {
3245 let gb = self.group_by(self.get_column_names_owned())?;
3246 let groups = gb.get_groups();
3247 Ok(is_unique_helper(
3248 groups,
3249 self.height() as IdxSize,
3250 true,
3251 false,
3252 ))
3253 }
3254
3255 /// Get a mask of all the duplicated rows in the [`DataFrame`].
3256 ///
3257 /// # Example
3258 ///
3259 /// ```no_run
3260 /// # use polars_core::prelude::*;
3261 /// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],
3262 /// "ISIN" => ["US02079K3059", "US02079K1079"])?;
3263 /// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;
3264 ///
3265 /// assert!(!ca.all());
3266 /// # Ok::<(), PolarsError>(())
3267 /// ```
3268 #[cfg(feature = "algorithm_group_by")]
3269 pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {
3270 let gb = self.group_by(self.get_column_names_owned())?;
3271 let groups = gb.get_groups();
3272 Ok(is_unique_helper(
3273 groups,
3274 self.height() as IdxSize,
3275 false,
3276 true,
3277 ))
3278 }
3279
3280 /// Create a new [`DataFrame`] that shows the null counts per column.
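    ///
    /// # Example
    ///
    /// A minimal sketch with a made-up column name:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df = df!("a" => [Some(1), None, Some(3)])?;
    /// let counts = df.null_count();
    /// assert_eq!(counts.shape(), (1, 1));
    /// # Ok::<(), PolarsError>(())
    /// ```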
3281 #[must_use]
3282 pub fn null_count(&self) -> Self {
3283 let cols = self
3284 .columns
3285 .iter()
3286 .map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))
3287 .collect();
3288 unsafe { Self::new_no_checks(1, cols) }
3289 }
3290
3291 /// Hash and combine the row values
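    ///
    /// # Example
    ///
    /// A minimal sketch (not compiled as a doctest, since it needs the `row_hash` feature):
    ///
    /// ```ignore
    /// # use polars_core::prelude::*;
    /// let mut df = df!("a" => [1, 2, 3])?;
    /// let hashes = df.hash_rows(None)?;
    /// assert_eq!(hashes.len(), df.height());
    /// # Ok::<(), PolarsError>(())
    /// ```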
3292 #[cfg(feature = "row_hash")]
3293 pub fn hash_rows(
3294 &mut self,
3295 hasher_builder: Option<PlSeedableRandomStateQuality>,
3296 ) -> PolarsResult<UInt64Chunked> {
3297 let dfs = split_df(self, POOL.current_num_threads(), false);
3298 let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;
3299
3300 let mut iter = cas.into_iter();
3301 let mut acc_ca = iter.next().unwrap();
3302 for ca in iter {
3303 acc_ca.append(&ca)?;
3304 }
3305 Ok(acc_ca.rechunk().into_owned())
3306 }
3307
3308 /// Get the supertype of the columns in this DataFrame
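    ///
    /// # Example
    ///
    /// A minimal sketch with made-up column names:
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// let df = df!("ints" => [1, 2], "floats" => [1.0, 2.0])?;
    /// assert_eq!(df.get_supertype().unwrap()?, DataType::Float64);
    /// # Ok::<(), PolarsError>(())
    /// ```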
3309 pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {
3310 self.columns
3311 .iter()
3312 .map(|s| Ok(s.dtype().clone()))
3313 .reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))
3314 }
3315
3316 /// Take by index values given by the slice `idx`.
3317 /// # Warning
3318     /// Be careful with allowing threads when calling this in a large hot loop:
3319     /// every thread split may land on the rayon stack and lead to a stack overflow.
3320 #[doc(hidden)]
3321 pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
3322 self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)
3323 }
3324
3325 /// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`
3326     /// if the index values in `idx` are sorted. This will maintain sorted flags.
3327 ///
3328 /// # Warning
3329     /// Be careful with allowing threads when calling this in a large hot loop:
3330     /// every thread split may land on the rayon stack and lead to a stack overflow.
3331 #[doc(hidden)]
3332 pub unsafe fn _take_unchecked_slice_sorted(
3333 &self,
3334 idx: &[IdxSize],
3335 allow_threads: bool,
3336 sorted: IsSorted,
3337 ) -> Self {
3338 #[cfg(debug_assertions)]
3339 {
3340 if idx.len() > 2 {
3341 match sorted {
3342 IsSorted::Ascending => {
3343 assert!(idx[0] <= idx[idx.len() - 1]);
3344 },
3345 IsSorted::Descending => {
3346 assert!(idx[0] >= idx[idx.len() - 1]);
3347 },
3348 _ => {},
3349 }
3350 }
3351 }
3352 let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);
3353 ca.set_sorted_flag(sorted);
3354 self.take_unchecked_impl(&ca, allow_threads)
3355 }
3356
3357 #[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]
3358 #[doc(hidden)]
3359 pub fn _partition_by_impl(
3360 &self,
3361 cols: &[PlSmallStr],
3362 stable: bool,
3363 include_key: bool,
3364 parallel: bool,
3365 ) -> PolarsResult<Vec<DataFrame>> {
3366 let selected_keys = self.select_columns(cols.iter().cloned())?;
3367 let groups = self.group_by_with_series(selected_keys, parallel, stable)?;
3368 let groups = groups.into_groups();
3369
3370 // drop key columns prior to calculation if requested
3371 let df = if include_key {
3372 self.clone()
3373 } else {
3374 self.drop_many(cols.iter().cloned())
3375 };
3376
3377 if parallel {
3378             // Don't parallelize the individual takes:
3379             // there is a lot of parallelization in take and this may easily stack overflow.
3380 POOL.install(|| {
3381 match groups.as_ref() {
3382 GroupsType::Idx(idx) => {
3383 // Rechunk as the gather may rechunk for every group #17562.
3384 let mut df = df.clone();
3385 df.as_single_chunk_par();
3386 Ok(idx
3387 .into_par_iter()
3388 .map(|(_, group)| {
3389 // groups are in bounds
3390 unsafe {
3391 df._take_unchecked_slice_sorted(
3392 group,
3393 false,
3394 IsSorted::Ascending,
3395 )
3396 }
3397 })
3398 .collect())
3399 },
3400 GroupsType::Slice { groups, .. } => Ok(groups
3401 .into_par_iter()
3402 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3403 .collect()),
3404 }
3405 })
3406 } else {
3407 match groups.as_ref() {
3408 GroupsType::Idx(idx) => {
3409 // Rechunk as the gather may rechunk for every group #17562.
3410 let mut df = df;
3411 df.as_single_chunk();
3412 Ok(idx
3413 .into_iter()
3414 .map(|(_, group)| {
3415 // groups are in bounds
3416 unsafe {
3417 df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)
3418 }
3419 })
3420 .collect())
3421 },
3422 GroupsType::Slice { groups, .. } => Ok(groups
3423 .iter()
3424 .map(|[first, len]| df.slice(*first as i64, *len as usize))
3425 .collect()),
3426 }
3427 }
3428 }
3429
3430 /// Split into multiple DataFrames partitioned by groups
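    ///
    /// # Example
    ///
    /// A minimal sketch with made-up column names:
    ///
    /// ```no_run
    /// # use polars_core::prelude::*;
    /// let df = df!("group" => ["a", "a", "b"], "vals" => [1, 2, 3])?;
    /// let parts: Vec<DataFrame> = df.partition_by(["group"], true)?;
    /// assert_eq!(parts.len(), 2);
    /// # Ok::<(), PolarsError>(())
    /// ```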
3431 #[cfg(feature = "partition_by")]
3432 pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>
3433 where
3434 I: IntoIterator<Item = S>,
3435 S: Into<PlSmallStr>,
3436 {
3437 let cols = cols
3438 .into_iter()
3439 .map(Into::into)
3440 .collect::<Vec<PlSmallStr>>();
3441 self._partition_by_impl(cols.as_slice(), false, include_key, true)
3442 }
3443
3444     /// Split into multiple DataFrames partitioned by groups.
3445     /// The order of the groups is maintained.
3446 #[cfg(feature = "partition_by")]
3447 pub fn partition_by_stable<I, S>(
3448 &self,
3449 cols: I,
3450 include_key: bool,
3451 ) -> PolarsResult<Vec<DataFrame>>
3452 where
3453 I: IntoIterator<Item = S>,
3454 S: Into<PlSmallStr>,
3455 {
3456 let cols = cols
3457 .into_iter()
3458 .map(Into::into)
3459 .collect::<Vec<PlSmallStr>>();
3460 self._partition_by_impl(cols.as_slice(), true, include_key, true)
3461 }
3462
3463 /// Unnest the given `Struct` columns. This means that the fields of the `Struct` type will be
3464 /// inserted as columns.
3465 #[cfg(feature = "dtype-struct")]
3466 pub fn unnest<I: IntoVec<PlSmallStr>>(
3467 &self,
3468 cols: I,
3469 separator: Option<&str>,
3470 ) -> PolarsResult<DataFrame> {
3471 let cols = cols.into_vec();
3472 self.unnest_impl(cols.into_iter().collect(), separator)
3473 }
3474
3475 #[cfg(feature = "dtype-struct")]
3476 fn unnest_impl(
3477 &self,
3478 cols: PlHashSet<PlSmallStr>,
3479 separator: Option<&str>,
3480 ) -> PolarsResult<DataFrame> {
3481 let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));
3482 let mut count = 0;
3483 for s in &self.columns {
3484 if cols.contains(s.name()) {
3485 let ca = s.struct_()?.clone();
3486 new_cols.extend(ca.fields_as_series().into_iter().map(|mut f| {
3487 if let Some(separator) = &separator {
3488 f.rename(polars_utils::format_pl_smallstr!(
3489 "{}{}{}",
3490 s.name(),
3491 separator,
3492 f.name()
3493 ));
3494 }
3495 Column::from(f)
3496 }));
3497 count += 1;
3498 } else {
3499 new_cols.push(s.clone())
3500 }
3501 }
3502 if count != cols.len() {
3503 // one or more columns not found
3504 // the code below will return an error with the missing name
3505 let schema = self.schema();
3506 for col in cols {
3507 let _ = schema
3508 .get(col.as_str())
3509 .ok_or_else(|| polars_err!(col_not_found = col))?;
3510 }
3511 }
3512 DataFrame::new(new_cols)
3513 }
3514
3515 pub(crate) fn infer_height(cols: &[Column]) -> usize {
3516 cols.first().map_or(0, Column::len)
3517 }
3518
3519 pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {
3520         // @Optimize: this does a lot of unnecessary allocations. We should probably have an
3521         // `append_chunk` method or something like it. It is just quite difficult to make that safe.
3522 let df = DataFrame::from(rb);
3523 polars_ensure!(
3524 self.schema() == df.schema(),
3525 SchemaMismatch: "cannot append record batch with different schema\n\n
3526 Got {:?}\nexpected: {:?}", df.schema(), self.schema(),
3527 );
3528 self.vstack_mut_owned_unchecked(df);
3529 Ok(())
3530 }
3531
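    /// Consume the [`DataFrame`] and return its columns as a `Vec<Column>`.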
    pub fn into_columns(self) -> Vec<Column> {
        self.columns
    }
}

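/// Iterator over the chunks of a [`DataFrame`] that yields one Arrow [`RecordBatch`] per chunk.
///
/// The columns of each chunk are converted to Arrow arrays, optionally in parallel on the rayon
/// thread pool.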
pub struct RecordBatchIter<'a> {
    columns: &'a Vec<Column>,
    schema: ArrowSchemaRef,
    idx: usize,
    n_chunks: usize,
    compat_level: CompatLevel,
    parallel: bool,
}

impl Iterator for RecordBatchIter<'_> {
    type Item = RecordBatch;

    fn next(&mut self) -> Option<Self::Item> {
        if self.idx >= self.n_chunks {
            return None;
        }

        // Create a batch of the columns with the same chunk no.
        let batch_cols: Vec<ArrayRef> = if self.parallel {
            let iter = self
                .columns
                .par_iter()
                .map(Column::as_materialized_series)
                .map(|s| s.to_arrow(self.idx, self.compat_level));
            POOL.install(|| iter.collect())
        } else {
            self.columns
                .iter()
                .map(Column::as_materialized_series)
                .map(|s| s.to_arrow(self.idx, self.compat_level))
                .collect()
        };
        self.idx += 1;

        let length = batch_cols.first().map_or(0, |arr| arr.len());
        Some(RecordBatch::new(length, self.schema.clone(), batch_cols))
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let n = self.n_chunks - self.idx;
        (n, Some(n))
    }
}

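/// Iterator that yields one [`RecordBatch`] per chunk, built by cloning the existing physical
/// arrays without converting them.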
pub struct PhysRecordBatchIter<'a> {
    schema: ArrowSchemaRef,
    arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,
}

impl Iterator for PhysRecordBatchIter<'_> {
    type Item = RecordBatch;

    fn next(&mut self) -> Option<Self::Item> {
        let arrs = self
            .arr_iters
            .iter_mut()
            .map(|phys_iter| phys_iter.next().cloned())
            .collect::<Option<Vec<_>>>()?;

        let length = arrs.first().map_or(0, |arr| arr.len());
        Some(RecordBatch::new(length, self.schema.clone(), arrs))
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        if let Some(iter) = self.arr_iters.first() {
            iter.size_hint()
        } else {
            (0, None)
        }
    }
}

impl Default for DataFrame {
    fn default() -> Self {
        DataFrame::empty()
    }
}

impl From<DataFrame> for Vec<Column> {
    fn from(df: DataFrame) -> Self {
        df.columns
    }
}

// utility to test if we can vstack/extend the columns
fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {
    polars_ensure!(
        left.name() == right.name(),
        ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
        left.name(), right.name(),
    );
    Ok(())
}

#[cfg(test)]
mod test {
    use super::*;

    fn create_frame() -> DataFrame {
        let s0 = Column::new("days".into(), [0, 1, 2].as_ref());
        let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());
        DataFrame::new(vec![s0, s1]).unwrap()
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_recordbatch_iterator() {
        let df = df!(
            "foo" => [1, 2, 3, 4, 5]
        )
        .unwrap();
        let mut iter = df.iter_chunks(CompatLevel::newest(), false);
        assert_eq!(5, iter.next().unwrap().len());
        assert!(iter.next().is_none());
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_select() {
        let df = create_frame();
        assert_eq!(
            df.column("days")
                .unwrap()
                .as_series()
                .unwrap()
                .equal(1)
                .unwrap()
                .sum(),
            Some(1)
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_string_col() {
        let col_name = "some_col";
        let v = vec!["test".to_string()];
        let s0 = Column::new(PlSmallStr::from_str(col_name), v);
        let mut df = DataFrame::new(vec![s0]).unwrap();

        df = df
            .filter(
                &df.column(col_name)
                    .unwrap()
                    .as_materialized_series()
                    .equal("")
                    .unwrap(),
            )
            .unwrap();
        assert_eq!(
            df.column(col_name)
                .unwrap()
                .as_materialized_series()
                .n_chunks(),
            1
        );
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn test_filter_broadcast_on_list_col() {
        let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);
        let ll: ListChunked = [&s1].iter().copied().collect();

        let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);
        let new = ll.filter(&mask).unwrap();

        assert_eq!(new.chunks.len(), 1);
        assert_eq!(new.len(), 0);
    }

    #[test]
    fn slice() {
        let df = create_frame();
        let sliced_df = df.slice(0, 2);
        assert_eq!(sliced_df.shape(), (2, 2));
    }

    #[test]
    fn rechunk_false() {
        let df = create_frame();
        assert!(!df.should_rechunk())
    }

    #[test]
    fn rechunk_true() -> PolarsResult<()> {
        let mut base = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // Create a series with multiple chunks
        let mut s = Series::new("foo".into(), 0..2);
        let s2 = Series::new("bar".into(), 0..1);
        s.append(&s2)?;

        // Append series to frame
        let out = base.with_column(s)?;

        // Now we should rechunk
        assert!(out.should_rechunk());
        Ok(())
    }

    #[test]
    fn test_duplicate_column() {
        let mut df = df! {
            "foo" => [1, 2, 3]
        }
        .unwrap();
        // check if column is replaced
        assert!(
            df.with_column(Series::new("foo".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(
            df.with_column(Series::new("bar".into(), &[1, 2, 3]))
                .is_ok()
        );
        assert!(df.column("bar").is_ok())
    }

    #[test]
    #[cfg_attr(miri, ignore)]
    fn distinct() {
        let df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();
        let df = df
            .unique_stable(None, UniqueKeepStrategy::First, None)
            .unwrap()
            .sort(["flt"], SortMultipleOptions::default())
            .unwrap();
        let valid = df! {
            "flt" => [1., 2., 3.],
            "int" => [1, 2, 3],
            "str" => ["a", "b", "c"]
        }
        .unwrap();
        assert!(df.equals(&valid));
    }

    #[test]
    fn test_vstack() {
        // check that it does not accidentally rechunk
        let mut df = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df.slice(0, 3)).unwrap();
        assert_eq!(df.first_col_n_chunks(), 2)
    }

    #[test]
    fn test_vstack_on_empty_dataframe() {
        let mut df = DataFrame::empty();

        let df_data = df! {
            "flt" => [1., 1., 2., 2., 3., 3.],
            "int" => [1, 1, 2, 2, 3, 3, ],
            "str" => ["a", "a", "b", "b", "c", "c"]
        }
        .unwrap();

        df.vstack_mut(&df_data).unwrap();
        assert_eq!(df.height, 6)
    }

    #[test]
    fn test_replace_or_add() -> PolarsResult<()> {
        let mut df = df!(
            "a" => [1, 2, 3],
            "b" => [1, 2, 3]
        )?;

        // check that the new column is "c" and not "bar".
        df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;

        assert_eq!(df.get_column_names(), &["a", "b", "c"]);
        Ok(())
    }

    #[test]
    fn test_unique_keep_none_with_slice() {
        let df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();
        let out = df
            .unique_stable(
                Some(&["x".to_string()][..]),
                UniqueKeepStrategy::None,
                Some((0, 2)),
            )
            .unwrap();
        let expected = df! {
            "x" => [3]
        }
        .unwrap();
        assert!(out.equals(&expected));
    }

    #[test]
    #[cfg(feature = "dtype-i8")]
    fn test_apply_result_schema() {
        let mut df = df! {
            "x" => [1, 2, 3, 2, 1]
        }
        .unwrap();

        let schema_before = df.schema().clone();
        df.apply("x", |f| f.cast(&DataType::Int8).unwrap()).unwrap();
        assert_ne!(&schema_before, df.schema());
    }
}