SQL vs PySpark – Transformations Cheat Sheet
A professional, friendly, and compact reference for everyday data engineering tasks. Save & share.
Column Operations & Conditionals
withColumn (create/replace a column)
  SQL:     SELECT *, salary*1.1 AS new_salary FROM emp
  PySpark: df = df.withColumn('new_salary', F.col('salary') * 1.1)

withColumnRenamed (single-column rename)
  SQL:     SELECT emp_id AS id FROM emp
  PySpark: df = df.withColumnRenamed('emp_id', 'id')

select (projection)
  SQL:     SELECT id, name FROM emp
  PySpark: df2 = df.select('id', 'name')

filter / where (row filter)
  SQL:     SELECT * FROM emp WHERE dept = 'IT'
  PySpark: df.filter(F.col('dept') == 'IT')

when / otherwise (conditional column)
  SQL:     SELECT CASE WHEN gender = 'm' THEN 'male' WHEN gender = 'f' THEN 'female' ELSE 'unknown' END AS g FROM emp
  PySpark: df.withColumn('g', F.when(F.col('gender') == 'm', 'male').when(F.col('gender') == 'f', 'female').otherwise('unknown'))
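
Put together, a minimal runnable sketch of these column operations (the emp rows and values below are made up for illustration):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical sample data, for illustration only.
    emp = spark.createDataFrame(
        [(1, 'Ana', 'IT', 5000.0, 'f'), (2, 'Bo', 'HR', 4000.0, 'm')],
        ['emp_id', 'name', 'dept', 'salary', 'gender'],
    )

    result = (
        emp.withColumnRenamed('emp_id', 'id')                   # single-column rename
           .withColumn('new_salary', F.col('salary') * 1.1)     # derive a new column
           .withColumn('g', F.when(F.col('gender') == 'm', 'male')
                             .when(F.col('gender') == 'f', 'female')
                             .otherwise('unknown'))             # CASE WHEN equivalent
           .filter(F.col('dept') == 'IT')                       # WHERE dept = 'IT'
           .select('id', 'name', 'new_salary', 'g')             # projection
    )
    result.show()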
Joins
Inner Join (default join type)
  SQL:     SELECT e.*, d.dept_name FROM emp e JOIN dept d ON e.dept_id = d.id
  PySpark: e.join(d, e.dept_id == d.id, 'inner')

Left Join (keep all left rows)
  SQL:     SELECT e.*, d.dept_name FROM emp e LEFT JOIN dept d ON e.dept_id = d.id
  PySpark: e.join(d, e.dept_id == d.id, 'left')

Right Join (keep all right rows)
  SQL:     SELECT e.*, d.dept_name FROM emp e RIGHT JOIN dept d ON e.dept_id = d.id
  PySpark: e.join(d, e.dept_id == d.id, 'right')

Full Join (all rows from both sides)
  SQL:     SELECT * FROM emp e FULL OUTER JOIN dept d ON e.dept_id = d.id
  PySpark: e.join(d, e.dept_id == d.id, 'outer')

Cross Join (Cartesian product)
  SQL:     SELECT * FROM emp CROSS JOIN dept
  PySpark: e.crossJoin(d)

Self Join (employee to manager)
  SQL:     SELECT e.name, m.name AS manager FROM emp e LEFT JOIN emp m ON e.manager_id = m.id
  PySpark: e.alias('e').join(e.alias('m'), F.col('e.manager_id') == F.col('m.id'), 'left').select(F.col('e.name'), F.col('m.name').alias('manager'))
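
A short runnable sketch combining an inner join with the aliased self join; the emp and dept rows below are invented for illustration:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical sample tables, for illustration only.
    emp = spark.createDataFrame(
        [(1, 'Ana', 10, None), (2, 'Bo', 20, 1)],
        'id INT, name STRING, dept_id INT, manager_id INT',
    )
    dept = spark.createDataFrame([(10, 'IT'), (20, 'HR')], ['id', 'dept_name'])

    # Inner join: emp JOIN dept ON emp.dept_id = dept.id
    emp.join(dept, emp.dept_id == dept.id, 'inner').show()

    # Self join via aliases: look up each employee's manager.
    (emp.alias('e')
        .join(emp.alias('m'), F.col('e.manager_id') == F.col('m.id'), 'left')
        .select(F.col('e.name'), F.col('m.name').alias('manager'))
        .show())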
Special Data Types
StructType (nested record)
  SQL:     -- nested column support depends on the engine; Spark SQL example:
           SELECT named_struct('city', city, 'zip', zip) AS addr FROM t
  PySpark: from pyspark.sql.types import StructType, StructField, StringType
           schema = StructType([StructField('city', StringType()), StructField('zip', StringType())])

ArrayType (ordered list)
  SQL:     SELECT array('a', 'b', 'c') AS arr
  PySpark: from pyspark.sql.types import ArrayType, StringType
           ArrayType(StringType())

MapType (key-value pairs)
  SQL:     SELECT map('k1', 'v1', 'k2', 'v2') AS m
  PySpark: from pyspark.sql.types import MapType, StringType
           MapType(StringType(), StringType())
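
A small sketch showing the three complex types used inside one schema; the field names and the sample row are hypothetical:

    from pyspark.sql import SparkSession
    from pyspark.sql.types import (StructType, StructField, StringType,
                                   ArrayType, MapType)

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical schema combining struct, array, and map columns.
    schema = StructType([
        StructField('addr', StructType([                             # nested record
            StructField('city', StringType()),
            StructField('zip', StringType()),
        ])),
        StructField('tags', ArrayType(StringType())),                # ordered list
        StructField('attrs', MapType(StringType(), StringType())),   # key-value pairs
    ])

    # One made-up row: a tuple for the struct, a list for the array, a dict for the map.
    df = spark.createDataFrame(
        [(('Pune', '411001'), ['a', 'b', 'c'], {'k1': 'v1', 'k2': 'v2'})], schema)
    df.printSchema()
    df.select('addr.city', 'tags', 'attrs').show(truncate=False)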
Aggregations
groupBy (basic group)
  SQL:     SELECT dept, COUNT(*) AS cnt FROM emp GROUP BY dept
  PySpark: df.groupBy('dept').count()

groupBy + agg (multiple aggregations)
  SQL:     SELECT dept, AVG(salary) AS avg_sal, MAX(salary) AS max_sal FROM emp GROUP BY dept
  PySpark: df.groupBy('dept').agg(F.avg('salary').alias('avg_sal'), F.max('salary').alias('max_sal'))

distinct (unique rows)
  SQL:     SELECT DISTINCT dept FROM emp
  PySpark: df.select('dept').distinct()

dropDuplicates (distinct on a subset of columns)
  SQL:     SELECT DISTINCT * FROM emp  -- conceptual equivalent
  PySpark: df.dropDuplicates(['id'])   # omit the column list to de-duplicate on all columns
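
A runnable sketch of the aggregation patterns above, on a made-up emp frame:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical emp rows, for illustration only.
    emp = spark.createDataFrame(
        [(1, 'IT', 5000), (2, 'IT', 7000), (3, 'HR', 4000), (3, 'HR', 4000)],
        ['id', 'dept', 'salary'],
    )

    emp.groupBy('dept').count().show()                 # basic group + count

    (emp.groupBy('dept')
        .agg(F.avg('salary').alias('avg_sal'),
             F.max('salary').alias('max_sal'))
        .show())                                       # multiple aggregations

    emp.select('dept').distinct().show()               # unique values of one column
    emp.dropDuplicates(['id']).show()                  # de-duplicate on a subset of columns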
Unions & Set Operations
UNION ALL (keep duplicates)
  SQL:     SELECT * FROM t1 UNION ALL SELECT * FROM t2
  PySpark: t1.union(t2)

UNION (remove duplicates)
  SQL:     SELECT * FROM t1 UNION SELECT * FROM t2
  PySpark: t1.union(t2).distinct()

unionByName (match columns by name)
  SQL:     -- N/A (engine-specific)
  PySpark: t1.unionByName(t2, allowMissingColumns=True)
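
A quick sketch contrasting positional union with name-based union; t1, t2, and t3 are invented frames:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical frames, for illustration only.
    t1 = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'val'])
    t2 = spark.createDataFrame([(2, 'b'), (3, 'c')], ['id', 'val'])
    t3 = spark.createDataFrame([('b', 2)], ['val', 'id'])   # same columns, different order

    t1.union(t2).show()              # UNION ALL: duplicates kept, columns matched by position
    t1.union(t2).distinct().show()   # UNION: duplicates removed
    t1.unionByName(t3).show()        # columns matched by name, not position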
Null Handling
dropna (drop rows with nulls)
  SQL:     SELECT * FROM t WHERE col IS NOT NULL
  PySpark: df.dropna(subset=['col'])   # or how='all' / how='any'

fillna (replace nulls)
  SQL:     SELECT COALESCE(col, 0) AS col FROM t
  PySpark: df.fillna({'col': 0})
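
A minimal sketch of the null-handling calls, on a hypothetical frame t with nulls:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical rows containing nulls, for illustration only.
    t = spark.createDataFrame([(1, 10), (2, None), (None, None)], 'id INT, col INT')

    t.dropna(subset=['col']).show()   # keep rows where col IS NOT NULL
    t.dropna(how='all').show()        # drop rows where every column is null
    t.fillna({'col': 0}).show()       # COALESCE(col, 0)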
Pivot & Unpivot
Pivot (wide from long)
  SQL:     -- portable form for many SQL engines:
           SELECT dept, SUM(CASE WHEN gender = 'M' THEN 1 ELSE 0 END) AS M, SUM(CASE WHEN gender = 'F' THEN 1 ELSE 0 END) AS F FROM emp GROUP BY dept
  PySpark: df.groupBy('dept').pivot('gender', ['M', 'F']).count()

Unpivot (long from wide, via stack)
  SQL:     -- UNPIVOT or CROSS APPLY VALUES (engine-specific)
  PySpark: df.selectExpr('dept', "stack(2, 'M', M, 'F', F) as (gender, cnt)")
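
A round-trip sketch: pivot long-to-wide, then unpivot back with stack(); the emp rows are made up for illustration:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical emp rows, for illustration only.
    emp = spark.createDataFrame(
        [('IT', 'M'), ('IT', 'F'), ('IT', 'M'), ('HR', 'F')],
        ['dept', 'gender'],
    )

    # Pivot long -> wide: one count column per gender value.
    wide = emp.groupBy('dept').pivot('gender', ['M', 'F']).count()
    wide.show()

    # Unpivot wide -> long again with stack().
    unpivoted = wide.selectExpr('dept', "stack(2, 'M', M, 'F', F) as (gender, cnt)")
    unpivoted.show()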
Higher-Order Functions & Transform
transform (arrays, element-wise)
  SQL:     SELECT transform(arr, x -> x + 1) AS arr2 FROM t
  PySpark: df.select(F.expr('transform(arr, x -> x + 1)').alias('arr2'))

filter (arrays, keep elements matching a predicate)
  SQL:     SELECT filter(arr, x -> x > 0) AS arr2 FROM t
  PySpark: df.select(F.expr('filter(arr, x -> x > 0)').alias('arr2'))

aggregate (arrays, fold/reduce)
  SQL:     SELECT aggregate(arr, 0, (acc, x) -> acc + x) AS s FROM t
  PySpark: df.select(F.expr('aggregate(arr, 0, (acc, x) -> acc + x)').alias('s'))
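
A compact sketch running the three array higher-order functions through F.expr; the arr values are invented:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical frame with an array column, for illustration only.
    t = spark.createDataFrame([(1, [1, -2, 3])], 'id INT, arr ARRAY<INT>')

    t.select(
        F.expr('transform(arr, x -> x + 1)').alias('plus_one'),            # element-wise
        F.expr('filter(arr, x -> x > 0)').alias('positives'),              # keep matching elements
        F.expr('aggregate(arr, 0, (acc, x) -> acc + x)').alias('total'),   # fold/reduce
    ).show()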
Extras & Tips
explain (view the logical/physical plan)
  SQL:     -- engine-specific
  PySpark: df.explain(True)

repartition / coalesce (shuffle vs. no-shuffle repartitioning)
  SQL:     -- N/A
  PySpark: df.repartition(8); df.coalesce(2)

write partitionBy (on-disk layout for partition pruning)
  SQL:     -- N/A
  PySpark: df.write.partitionBy('year', 'month').parquet(path)