|
| 1 | +.. Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +.. or more contributor license agreements. See the NOTICE file |
| 3 | +.. distributed with this work for additional information |
| 4 | +.. regarding copyright ownership. The ASF licenses this file |
| 5 | +.. to you under the Apache License, Version 2.0 (the |
| 6 | +.. "License"); you may not use this file except in compliance |
| 7 | +.. with the License. You may obtain a copy of the License at |
| 8 | +
|
| 9 | +.. http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +
|
| 11 | +.. Unless required by applicable law or agreed to in writing, |
| 12 | +.. software distributed under the License is distributed on an |
| 13 | +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +.. KIND, either express or implied. See the License for the |
| 15 | +.. specific language governing permissions and limitations |
| 16 | +.. under the License. |
| 17 | +
|
| 18 | +DataFrame Operations |
| 19 | +==================== |
| 20 | + |
| 21 | +Working with DataFrames |
| 22 | +----------------------- |
| 23 | + |
| 24 | +A DataFrame in DataFusion represents a logical plan that defines a series of operations to be performed on data. |
| 25 | +This logical plan is not executed until you call a terminal operation like :py:func:`~datafusion.dataframe.DataFrame.collect` |
| 26 | +or :py:func:`~datafusion.dataframe.DataFrame.show`. |
| 27 | + |
| 28 | +DataFrames provide a familiar API for data manipulation: |
| 29 | + |
| 30 | +.. ipython:: python |
| 31 | +
|
| 32 | + import datafusion |
| 33 | + from datafusion import col, lit, functions as f |
| 34 | + |
| 35 | + ctx = datafusion.SessionContext() |
| 36 | + |
| 37 | + # Create a DataFrame from a CSV file |
| 38 | + df = ctx.read_csv("example.csv") |
| 39 | + |
| 40 | + # Add transformations |
| 41 | + df = df.filter(col("age") > lit(30)) \ |
| 42 | + .select([col("name"), col("age"), (col("salary") * lit(1.1)).alias("new_salary")]) \ |
| 43 | + .sort("age") |
| 44 | + |
| 45 | + # Execute the plan |
| 46 | + df.show() |
| 47 | +
|
| 48 | +Common DataFrame Operations |
| 49 | +--------------------------- |
| 50 | + |
| 51 | +DataFusion supports a wide range of operations on DataFrames: |
| 52 | + |
| 53 | +Filtering and Selection |
| 54 | +~~~~~~~~~~~~~~~~~~~~~~~ |
| 55 | + |
| 56 | +.. ipython:: python |
| 57 | +
|
| 58 | + # Filter rows |
| 59 | + df = df.filter(col("age") > lit(30)) |
| 60 | + |
| 61 | + # Select columns |
| 62 | + df = df.select([col("name"), col("age")]) |
| 63 | + |
| 64 | + # Select by column name |
| 65 | + df = df.select_columns(["name", "age"]) |
| 66 | + |
| 67 | + # Select using column indexing |
| 68 | + df = df["name", "age"] |
| 69 | +
|
| 70 | +Aggregation |
| 71 | +~~~~~~~~~~~ |
| 72 | + |
| 73 | +.. ipython:: python |
| 74 | +
|
| 75 | + # Group by and aggregate |
| 76 | + df = df.aggregate( |
| 77 | + [col("category")], # Group by columns |
| 78 | + [f.sum(col("amount")).alias("total"), |
| 79 | + f.avg(col("price")).alias("avg_price")] |
| 80 | + ) |
| 81 | +
|
| 82 | +Joins |
| 83 | +~~~~~ |
| 84 | + |
| 85 | +.. ipython:: python |
| 86 | +
|
| 87 | + # Join two DataFrames |
| 88 | + df_joined = df1.join( |
| 89 | + df2, |
| 90 | + how="inner", |
| 91 | + left_on=["id"], |
| 92 | + right_on=["id"] |
| 93 | + ) |
| 94 | + |
| 95 | + # Join with custom expressions |
| 96 | + df_joined = df1.join_on( |
| 97 | + df2, |
| 98 | + [col("df1.id") == col("df2.id")], |
| 99 | + how="left" |
| 100 | + ) |
| 101 | +
|
| 102 | +DataFrame Visualization |
| 103 | +----------------------- |
| 104 | + |
| 105 | +Jupyter Notebook Integration |
| 106 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 107 | + |
| 108 | +When working in Jupyter notebooks, DataFrames automatically display as HTML tables. This is |
| 109 | +handled by the :code:`_repr_html_` method, which provides a rich, formatted view of your data. |
| 110 | + |
| 111 | +.. ipython:: python |
| 112 | +
|
| 113 | + # DataFrames render as HTML tables in notebooks |
| 114 | + df # Just displaying the DataFrame renders it as HTML |
| 115 | +
|
| 116 | +Customizing DataFrame Display |
| 117 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 118 | + |
| 119 | +You can customize how DataFrames are displayed using the HTML formatter: |
| 120 | + |
| 121 | +.. ipython:: python |
| 122 | +
|
| 123 | + from datafusion.html_formatter import configure_formatter |
| 124 | + |
| 125 | + # Change display settings |
| 126 | + configure_formatter( |
| 127 | + max_rows=100, # Show more rows |
| 128 | + truncate_width=30, # Allow longer strings |
| 129 | + theme="light", # Use light theme |
| 130 | + precision=2 # Set decimal precision |
| 131 | + ) |
| 132 | + |
| 133 | + # Now display uses the new format |
| 134 | + df.show() |
| 135 | +
|
| 136 | +Creating a Custom Style Provider |
| 137 | +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 138 | + |
| 139 | +For advanced styling needs: |
| 140 | + |
| 141 | +.. code-block:: python |
| 142 | +
|
| 143 | + from datafusion.html_formatter import StyleProvider, configure_formatter |
| 144 | + |
| 145 | + class CustomStyleProvider(StyleProvider): |
| 146 | + def get_table_styles(self): |
| 147 | + return { |
| 148 | + "table": "border-collapse: collapse; width: 100%;", |
| 149 | + "th": "background-color: #4CAF50; color: white; padding: 10px;", |
| 150 | + "td": "border: 1px solid #ddd; padding: 8px;", |
| 151 | + "tr:hover": "background-color: #f5f5f5;", |
| 152 | + } |
| 153 | + |
| 154 | + def get_value_styles(self, dtype, value): |
| 155 | + if dtype == "float" and value < 0: |
| 156 | + return "color: red; font-weight: bold;" |
| 157 | + return None |
| 158 | + |
| 159 | + # Apply custom styling |
| 160 | + configure_formatter(style_provider=CustomStyleProvider()) |
| 161 | +
|
| 162 | +Managing Display Settings |
| 163 | +~~~~~~~~~~~~~~~~~~~~~~~~~ |
| 164 | + |
| 165 | +You can temporarily change formatting settings with context managers: |
| 166 | + |
| 167 | +.. code-block:: python |
| 168 | +
|
| 169 | + from datafusion.html_formatter import formatting_context |
| 170 | + |
| 171 | + # Use different formatting temporarily |
| 172 | + with formatting_context(max_rows=5, theme="dark"): |
| 173 | + df.show() # Will show only 5 rows with dark theme |
| 174 | + |
| 175 | + # Reset to default formatting |
| 176 | + from datafusion.html_formatter import reset_formatter |
| 177 | + reset_formatter() |
| 178 | +
|
| 179 | +Converting to Other Formats |
| 180 | +--------------------------- |
| 181 | + |
| 182 | +DataFusion DataFrames can be easily converted to other popular formats: |
| 183 | + |
| 184 | +.. ipython:: python |
| 185 | +
|
| 186 | + # Convert to Arrow Table |
| 187 | + arrow_table = df.to_arrow_table() |
| 188 | + |
| 189 | + # Convert to Pandas DataFrame |
| 190 | + pandas_df = df.to_pandas() |
| 191 | + |
| 192 | + # Convert to Polars DataFrame |
| 193 | + polars_df = df.to_polars() |
| 194 | + |
| 195 | + # Convert to Python data structures |
| 196 | + python_dict = df.to_pydict() |
| 197 | + python_list = df.to_pylist() |
| 198 | +
|
| 199 | +Saving DataFrames |
| 200 | +----------------- |
| 201 | + |
| 202 | +You can write DataFrames to various file formats: |
| 203 | + |
| 204 | +.. ipython:: python |
| 205 | +
|
| 206 | + # Write to CSV |
| 207 | + df.write_csv("output.csv", with_header=True) |
| 208 | + |
| 209 | + # Write to Parquet |
| 210 | + df.write_parquet("output.parquet", compression="zstd") |
| 211 | + |
| 212 | + # Write to JSON |
| 213 | + df.write_json("output.json") |
0 commit comments