
SQL vs PySpark – Transformations Cheat Sheet

Professional, friendly, and compact reference for everyday data engineering tasks. Save & share.

Column Operations & Conditionals

withColumn
  SQL:     SELECT *, salary*1.1 AS new_salary FROM emp
  PySpark: df = df.withColumn('new_salary', F.col('salary') * 1.1)
  Notes:   Create/replace column

withColumnRenamed
  SQL:     SELECT emp_id AS id FROM emp
  PySpark: df = df.withColumnRenamed('emp_id', 'id')
  Notes:   Single-column rename

select
  SQL:     SELECT id, name FROM emp
  PySpark: df2 = df.select('id', 'name')
  Notes:   Projection

filter/where
  SQL:     SELECT * FROM emp WHERE dept='IT'
  PySpark: df.filter(F.col('dept') == 'IT')
  Notes:   Row filter

when/otherwise
  SQL:     SELECT CASE WHEN gender='m' THEN 'male' WHEN gender='f' THEN 'female' ELSE 'unknown' END AS g FROM emp
  PySpark: df.withColumn('g', F.when(F.col('gender') == 'm', 'male').when(F.col('gender') == 'f', 'female').otherwise('unknown'))
  Notes:   Conditional
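
The sketch below chains these column operations end to end. The SparkSession, column names, and sample rows are illustrative assumptions, not part of the cheat sheet itself:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.appName("column-ops").getOrCreate()

    # Hypothetical sample data standing in for the emp table
    emp = spark.createDataFrame(
        [(1, 'Asha', 'IT', 50000.0, 'f'), (2, 'Ravi', 'HR', 40000.0, 'm')],
        ['emp_id', 'name', 'dept', 'salary', 'gender'],
    )

    result = (
        emp.withColumn('new_salary', F.col('salary') * 1.1)   # create/replace a column
           .withColumnRenamed('emp_id', 'id')                 # single-column rename
           .filter(F.col('dept') == 'IT')                     # row filter
           .withColumn('g',
               F.when(F.col('gender') == 'm', 'male')
                .when(F.col('gender') == 'f', 'female')
                .otherwise('unknown'))                        # CASE WHEN equivalent
           .select('id', 'name', 'new_salary', 'g')           # projection
    )
    result.show()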

Joins

Inner Join
  SQL:     SELECT e.*, d.dept_name FROM emp e JOIN dept d ON e.dept_id=d.id
  PySpark: e.join(d, e.dept_id == d.id, 'inner')
  Notes:   Default join

Left Join
  SQL:     SELECT e.*, d.dept_name FROM emp e LEFT JOIN dept d ON e.dept_id=d.id
  PySpark: e.join(d, e.dept_id == d.id, 'left')
  Notes:   Keep all left rows

Right Join
  SQL:     SELECT e.*, d.dept_name FROM emp e RIGHT JOIN dept d ON e.dept_id=d.id
  PySpark: e.join(d, e.dept_id == d.id, 'right')
  Notes:   Keep all right rows

Full Join
  SQL:     SELECT * FROM emp e FULL OUTER JOIN dept d ON e.dept_id=d.id
  PySpark: e.join(d, e.dept_id == d.id, 'outer')
  Notes:   All rows from both

Cross Join
  SQL:     SELECT * FROM emp CROSS JOIN dept
  PySpark: e.crossJoin(d)
  Notes:   Cartesian product

Self Join
  SQL:     SELECT e.name, m.name AS manager FROM emp e LEFT JOIN emp m ON e.manager_id=m.id
  PySpark: e.alias('e').join(e.alias('m'), F.col('e.manager_id') == F.col('m.id'), 'left').select(F.col('e.name'), F.col('m.name').alias('manager'))
  Notes:   Emp ↔ Manager
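
A minimal runnable sketch of the join patterns above. The emp and dept DataFrames, their schemas, and the rows are made up for illustration:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical emp and dept tables with explicit DDL schemas
    e = spark.createDataFrame(
        [(1, 'Asha', 10, None), (2, 'Ravi', 20, 1)],
        'id INT, name STRING, dept_id INT, manager_id INT',
    )
    d = spark.createDataFrame([(10, 'IT'), (30, 'Sales')], 'id INT, dept_name STRING')

    inner = e.join(d, e.dept_id == d.id, 'inner')   # matching rows only
    left  = e.join(d, e.dept_id == d.id, 'left')    # keep all left rows
    outer = e.join(d, e.dept_id == d.id, 'outer')   # all rows from both sides

    # Self join: employee to manager via aliases
    managers = (
        e.alias('e')
         .join(e.alias('m'), F.col('e.manager_id') == F.col('m.id'), 'left')
         .select(F.col('e.name'), F.col('m.name').alias('manager'))
    )
    managers.show()
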
Special Data Types

StructType
  SQL:     -- Nested columns depend on engine; Spark SQL example:
           SELECT named_struct('city', city, 'zip', zip) AS addr FROM t
  PySpark: from pyspark.sql.types import StructType, StructField, StringType
           schema = StructType([StructField('city', StringType()), StructField('zip', StringType())])
  Notes:   Nested record

ArrayType
  SQL:     SELECT array('a','b','c') AS arr
  PySpark: from pyspark.sql.types import ArrayType, StringType
           ArrayType(StringType())
  Notes:   Ordered list

MapType
  SQL:     SELECT map('k1','v1','k2','v2') AS m
  PySpark: from pyspark.sql.types import MapType, StringType
           MapType(StringType(), StringType())
  Notes:   Key-value
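
The sketch below builds one DataFrame that uses all three complex types. The field names ('addr', 'tags', 'props') and sample row are assumptions chosen only to show the schema in action:

    from pyspark.sql import SparkSession
    from pyspark.sql.types import (
        StructType, StructField, StringType, ArrayType, MapType,
    )

    spark = SparkSession.builder.getOrCreate()

    # Explicit schema: a nested struct, an array, and a map column
    schema = StructType([
        StructField('addr', StructType([
            StructField('city', StringType()),
            StructField('zip', StringType()),
        ])),
        StructField('tags', ArrayType(StringType())),
        StructField('props', MapType(StringType(), StringType())),
    ])

    df = spark.createDataFrame(
        [(('Chennai', '600001'), ['a', 'b'], {'k1': 'v1'})],
        schema,
    )

    df.printSchema()
    df.select('addr.city', 'tags', 'props').show(truncate=False)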

Aggregations

groupBy
  SQL:     SELECT dept, COUNT(*) AS cnt FROM emp GROUP BY dept
  PySpark: df.groupBy('dept').count()
  Notes:   Basic group

groupBy agg
  SQL:     SELECT dept, AVG(salary) AS avg_sal, MAX(salary) AS max_sal FROM emp GROUP BY dept
  PySpark: df.groupBy('dept').agg(F.avg('salary').alias('avg_sal'), F.max('salary').alias('max_sal'))
  Notes:   Multiple aggs

distinct
  SQL:     SELECT DISTINCT dept FROM emp
  PySpark: df.select('dept').distinct()
  Notes:   Unique rows

dropDuplicates
  SQL:     SELECT DISTINCT * FROM emp  -- (conceptual)
  PySpark: df.dropDuplicates(['id'])  # or no cols for all
  Notes:   Distinct subset
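
A short sketch exercising the aggregation calls above on an assumed emp DataFrame (columns and rows are illustrative):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical emp rows
    emp = spark.createDataFrame(
        [(1, 'IT', 50000.0), (2, 'IT', 70000.0), (3, 'HR', 40000.0)],
        ['id', 'dept', 'salary'],
    )

    emp.groupBy('dept').count().show()          # basic group

    emp.groupBy('dept').agg(                    # multiple aggregations
        F.avg('salary').alias('avg_sal'),
        F.max('salary').alias('max_sal'),
    ).show()

    emp.select('dept').distinct().show()        # unique values
    emp.dropDuplicates(['dept']).show()         # one (arbitrary) row per dept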

Unions & Set Operations

UNION ALL
  SQL:     SELECT * FROM t1 UNION ALL SELECT * FROM t2
  PySpark: t1.union(t2)
  Notes:   Union with duplicates

UNION (distinct)
  SQL:     SELECT * FROM t1 UNION SELECT * FROM t2
  PySpark: t1.union(t2).distinct()
  Notes:   Remove duplicates

unionByName
  SQL:     -- N/A (engine-specific)
  PySpark: t1.unionByName(t2, allowMissingColumns=True)
  Notes:   Match by column name
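
A sketch of union vs unionByName with assumed t1/t2/t3 DataFrames. Note that union() matches columns by position, while unionByName() matches by name; allowMissingColumns requires Spark 3.1+:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    t1 = spark.createDataFrame([(1, 'a')], ['id', 'val'])
    t2 = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'val'])

    t1.union(t2).show()              # UNION ALL: keeps duplicates, positional match
    t1.union(t2).distinct().show()   # UNION: duplicates removed

    # unionByName with a column present on only one side; the missing
    # column is filled with nulls (Spark 3.1+)
    t3 = spark.createDataFrame([(3, 'x')], ['id', 'extra'])
    t1.unionByName(t3, allowMissingColumns=True).show()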

Null Handling

dropna
  SQL:     SELECT * FROM t WHERE col IS NOT NULL
  PySpark: df.dropna(subset=['col'])  # or how='all'/'any'
  Notes:   Drop nulls

fillna
  SQL:     SELECT COALESCE(col, 0) AS col FROM t
  PySpark: df.fillna({'col': 0})
  Notes:   Replace nulls
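
A minimal null-handling sketch on a made-up two-column DataFrame:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    df = spark.createDataFrame([(1, 10), (2, None)], 'id INT, col INT')

    df.dropna(subset=['col']).show()   # drop rows where col is null
    df.fillna({'col': 0}).show()       # replace nulls in col with 0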

Pivot & Unpivot

Pivot
  SQL:     -- Many SQL engines:
           SELECT dept, SUM(CASE WHEN gender='M' THEN 1 ELSE 0 END) AS M, SUM(CASE WHEN gender='F' THEN 1 ELSE 0 END) AS F FROM emp GROUP BY dept
  PySpark: df.groupBy('dept').pivot('gender', ['M','F']).count()
  Notes:   Wide from long

Unpivot
  SQL:     -- UNPIVOT or CROSS APPLY VALUES (engine-specific)
  PySpark: df.selectExpr('dept', "stack(2, 'M', M, 'F', F) as (gender, cnt)")
  Notes:   Long from wide (via stack)

Higher-Order Functions & Transform

transform (arrays)
  SQL:     SELECT transform(arr, x -> x + 1) AS arr2 FROM t
  PySpark: df.select(F.expr('transform(arr, x -> x + 1)').alias('arr2'))
  Notes:   Element-wise

filter (arrays)
  SQL:     SELECT filter(arr, x -> x > 0) AS arr2 FROM t
  PySpark: df.select(F.expr('filter(arr, x -> x > 0)').alias('arr2'))
  Notes:   Predicate on arrays

aggregate (arrays)
  SQL:     SELECT aggregate(arr, 0, (acc, x) -> acc + x) AS s FROM t
  PySpark: df.select(F.expr('aggregate(arr, 0, (acc, x) -> acc + x)').alias('s'))
  Notes:   Fold/reduce
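
A sketch of the three array higher-order functions on an assumed arr column. F.expr works on older Spark versions; Spark 3.1+ also exposes F.transform, F.filter, and F.aggregate as native Python helpers:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Single array<int> column with a made-up row
    df = spark.createDataFrame([([1, -2, 3],)], 'arr array<int>')

    df.select(
        F.expr('transform(arr, x -> x + 1)').alias('plus_one'),          # element-wise
        F.expr('filter(arr, x -> x > 0)').alias('positives'),            # keep matches
        F.expr('aggregate(arr, 0, (acc, x) -> acc + x)').alias('total'), # fold/reduce
    ).show()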

Extras & Tips

explain
  SQL:     -- Engine-specific
  PySpark: df.explain(True)
  Notes:   View logical/physical plan

repartition/coalesce
  SQL:     -- N/A
  PySpark: df.repartition(8); df.coalesce(2)
  Notes:   Shuffle vs no-shuffle

write partitionBy
  SQL:     -- N/A
  PySpark: df.write.partitionBy('year', 'month').parquet(path)
  Notes:   Layout for pruning
