SQL vs PySpark – Transformations Cheat Sheet
A professional, friendly, and compact reference for everyday data engineering tasks. Save & share.
Column Operations & Conditionals
withColumn (create/replace a column)
  SQL:     SELECT *, salary*1.1 AS new_salary FROM emp
  PySpark: df = df.withColumn('new_salary', F.col('salary') * 1.1)

withColumnRenamed (single-column rename)
  SQL:     SELECT emp_id AS id FROM emp
  PySpark: df = df.withColumnRenamed('emp_id', 'id')

select (projection)
  SQL:     SELECT id, name FROM emp
  PySpark: df2 = df.select('id', 'name')

filter / where (row filter)
  SQL:     SELECT * FROM emp WHERE dept = 'IT'
  PySpark: df.filter(F.col('dept') == 'IT')

when / otherwise (conditional column)
  SQL:     SELECT CASE WHEN gender = 'm' THEN 'male' WHEN gender = 'f' THEN 'female' ELSE 'unknown' END AS g FROM emp
  PySpark: df.withColumn('g', F.when(F.col('gender') == 'm', 'male').when(F.col('gender') == 'f', 'female').otherwise('unknown'))
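
Put together, a minimal runnable sketch of these column operations (the emp rows and values below are made up for illustration):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical sample data, for illustration only.
    emp = spark.createDataFrame(
        [(1, 'Ana', 'IT', 5000.0, 'f'), (2, 'Bo', 'HR', 4000.0, 'm')],
        ['emp_id', 'name', 'dept', 'salary', 'gender'],
    )

    result = (
        emp.withColumnRenamed('emp_id', 'id')                   # single-column rename
           .withColumn('new_salary', F.col('salary') * 1.1)     # derive a new column
           .withColumn('g', F.when(F.col('gender') == 'm', 'male')
                             .when(F.col('gender') == 'f', 'female')
                             .otherwise('unknown'))             # CASE WHEN equivalent
           .filter(F.col('dept') == 'IT')                       # WHERE dept = 'IT'
           .select('id', 'name', 'new_salary', 'g')             # projection
    )
    result.show()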
Joins
Inner Join (default join type)
  SQL:     SELECT e.*, d.dept_name FROM emp e JOIN dept d ON e.dept_id = d.id
  PySpark: e.join(d, e.dept_id == d.id, 'inner')

Left Join (keep all left rows)
  SQL:     SELECT e.*, d.dept_name FROM emp e LEFT JOIN dept d ON e.dept_id = d.id
  PySpark: e.join(d, e.dept_id == d.id, 'left')

Right Join (keep all right rows)
  SQL:     SELECT e.*, d.dept_name FROM emp e RIGHT JOIN dept d ON e.dept_id = d.id
  PySpark: e.join(d, e.dept_id == d.id, 'right')

Full Join (all rows from both sides)
  SQL:     SELECT * FROM emp e FULL OUTER JOIN dept d ON e.dept_id = d.id
  PySpark: e.join(d, e.dept_id == d.id, 'outer')

Cross Join (Cartesian product)
  SQL:     SELECT * FROM emp CROSS JOIN dept
  PySpark: e.crossJoin(d)

Self Join (employee to manager)
  SQL:     SELECT e.name, m.name AS manager FROM emp e LEFT JOIN emp m ON e.manager_id = m.id
  PySpark: e.alias('e').join(e.alias('m'), F.col('e.manager_id') == F.col('m.id'), 'left').select(F.col('e.name'), F.col('m.name').alias('manager'))
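
A short runnable sketch combining an inner join with the aliased self join; the emp and dept rows below are invented for illustration:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical sample tables, for illustration only.
    emp = spark.createDataFrame(
        [(1, 'Ana', 10, None), (2, 'Bo', 20, 1)],
        'id INT, name STRING, dept_id INT, manager_id INT',
    )
    dept = spark.createDataFrame([(10, 'IT'), (20, 'HR')], ['id', 'dept_name'])

    # Inner join: emp JOIN dept ON emp.dept_id = dept.id
    emp.join(dept, emp.dept_id == dept.id, 'inner').show()

    # Self join via aliases: look up each employee's manager.
    (emp.alias('e')
        .join(emp.alias('m'), F.col('e.manager_id') == F.col('m.id'), 'left')
        .select(F.col('e.name'), F.col('m.name').alias('manager'))
        .show())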
Special Data Types
StructType (nested record)
  SQL:     -- nested column support depends on the engine; Spark SQL example:
           SELECT named_struct('city', city, 'zip', zip) AS addr FROM t
  PySpark: from pyspark.sql.types import StructType, StructField, StringType
           schema = StructType([StructField('city', StringType()), StructField('zip', StringType())])

ArrayType (ordered list)
  SQL:     SELECT array('a', 'b', 'c') AS arr
  PySpark: from pyspark.sql.types import ArrayType, StringType
           ArrayType(StringType())

MapType (key-value pairs)
  SQL:     SELECT map('k1', 'v1', 'k2', 'v2') AS m
  PySpark: from pyspark.sql.types import MapType, StringType
           MapType(StringType(), StringType())
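
A small sketch showing the three complex types used inside one schema; the field names and the sample row are hypothetical:

    from pyspark.sql import SparkSession
    from pyspark.sql.types import (StructType, StructField, StringType,
                                   ArrayType, MapType)

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical schema combining struct, array, and map columns.
    schema = StructType([
        StructField('addr', StructType([                             # nested record
            StructField('city', StringType()),
            StructField('zip', StringType()),
        ])),
        StructField('tags', ArrayType(StringType())),                # ordered list
        StructField('attrs', MapType(StringType(), StringType())),   # key-value pairs
    ])

    # One made-up row: a tuple for the struct, a list for the array, a dict for the map.
    df = spark.createDataFrame(
        [(('Pune', '411001'), ['a', 'b', 'c'], {'k1': 'v1', 'k2': 'v2'})], schema)
    df.printSchema()
    df.select('addr.city', 'tags', 'attrs').show(truncate=False)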
Aggregations
groupBy (basic group)
  SQL:     SELECT dept, COUNT(*) AS cnt FROM emp GROUP BY dept
  PySpark: df.groupBy('dept').count()

groupBy + agg (multiple aggregations)
  SQL:     SELECT dept, AVG(salary) AS avg_sal, MAX(salary) AS max_sal FROM emp GROUP BY dept
  PySpark: df.groupBy('dept').agg(F.avg('salary').alias('avg_sal'), F.max('salary').alias('max_sal'))

distinct (unique rows)
  SQL:     SELECT DISTINCT dept FROM emp
  PySpark: df.select('dept').distinct()

dropDuplicates (distinct on a subset of columns)
  SQL:     SELECT DISTINCT * FROM emp  -- conceptual equivalent
  PySpark: df.dropDuplicates(['id'])   # omit the column list to de-duplicate on all columns
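
A runnable sketch of the aggregation patterns above, on a made-up emp frame:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical emp rows, for illustration only.
    emp = spark.createDataFrame(
        [(1, 'IT', 5000), (2, 'IT', 7000), (3, 'HR', 4000), (3, 'HR', 4000)],
        ['id', 'dept', 'salary'],
    )

    emp.groupBy('dept').count().show()                 # basic group + count

    (emp.groupBy('dept')
        .agg(F.avg('salary').alias('avg_sal'),
             F.max('salary').alias('max_sal'))
        .show())                                       # multiple aggregations

    emp.select('dept').distinct().show()               # unique values of one column
    emp.dropDuplicates(['id']).show()                  # de-duplicate on a subset of columns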
Unions & Set Operations
UNION ALL (keep duplicates)
  SQL:     SELECT * FROM t1 UNION ALL SELECT * FROM t2
  PySpark: t1.union(t2)

UNION (remove duplicates)
  SQL:     SELECT * FROM t1 UNION SELECT * FROM t2
  PySpark: t1.union(t2).distinct()

unionByName (match columns by name)
  SQL:     -- N/A (engine-specific)
  PySpark: t1.unionByName(t2, allowMissingColumns=True)
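
A quick sketch contrasting positional union with name-based union; t1, t2, and t3 are invented frames:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical frames, for illustration only.
    t1 = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'val'])
    t2 = spark.createDataFrame([(2, 'b'), (3, 'c')], ['id', 'val'])
    t3 = spark.createDataFrame([('b', 2)], ['val', 'id'])   # same columns, different order

    t1.union(t2).show()              # UNION ALL: duplicates kept, columns matched by position
    t1.union(t2).distinct().show()   # UNION: duplicates removed
    t1.unionByName(t3).show()        # columns matched by name, not position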
Null Handling
dropna (drop rows with nulls)
  SQL:     SELECT * FROM t WHERE col IS NOT NULL
  PySpark: df.dropna(subset=['col'])   # or how='all' / how='any'

fillna (replace nulls)
  SQL:     SELECT COALESCE(col, 0) AS col FROM t
  PySpark: df.fillna({'col': 0})
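
A minimal sketch of the null-handling calls, on a hypothetical frame t with nulls:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical rows containing nulls, for illustration only.
    t = spark.createDataFrame([(1, 10), (2, None), (None, None)], 'id INT, col INT')

    t.dropna(subset=['col']).show()   # keep rows where col IS NOT NULL
    t.dropna(how='all').show()        # drop rows where every column is null
    t.fillna({'col': 0}).show()       # COALESCE(col, 0)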
Pivot & Unpivot
Pivot (wide from long)
  SQL:     -- portable form for many SQL engines:
           SELECT dept, SUM(CASE WHEN gender = 'M' THEN 1 ELSE 0 END) AS M, SUM(CASE WHEN gender = 'F' THEN 1 ELSE 0 END) AS F FROM emp GROUP BY dept
  PySpark: df.groupBy('dept').pivot('gender', ['M', 'F']).count()

Unpivot (long from wide, via stack)
  SQL:     -- UNPIVOT or CROSS APPLY VALUES (engine-specific)
  PySpark: df.selectExpr('dept', "stack(2, 'M', M, 'F', F) as (gender, cnt)")
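
A round-trip sketch: pivot long-to-wide, then unpivot back with stack(); the emp rows are made up for illustration:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical emp rows, for illustration only.
    emp = spark.createDataFrame(
        [('IT', 'M'), ('IT', 'F'), ('IT', 'M'), ('HR', 'F')],
        ['dept', 'gender'],
    )

    # Pivot long -> wide: one count column per gender value.
    wide = emp.groupBy('dept').pivot('gender', ['M', 'F']).count()
    wide.show()

    # Unpivot wide -> long again with stack().
    unpivoted = wide.selectExpr('dept', "stack(2, 'M', M, 'F', F) as (gender, cnt)")
    unpivoted.show()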
Higher-Order Functions & Transform
transform (arrays, element-wise)
  SQL:     SELECT transform(arr, x -> x + 1) AS arr2 FROM t
  PySpark: df.select(F.expr('transform(arr, x -> x + 1)').alias('arr2'))

filter (arrays, keep elements matching a predicate)
  SQL:     SELECT filter(arr, x -> x > 0) AS arr2 FROM t
  PySpark: df.select(F.expr('filter(arr, x -> x > 0)').alias('arr2'))

aggregate (arrays, fold/reduce)
  SQL:     SELECT aggregate(arr, 0, (acc, x) -> acc + x) AS s FROM t
  PySpark: df.select(F.expr('aggregate(arr, 0, (acc, x) -> acc + x)').alias('s'))
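
A compact sketch running the three array higher-order functions through F.expr; the arr values are invented:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()

    # Hypothetical frame with an array column, for illustration only.
    t = spark.createDataFrame([(1, [1, -2, 3])], 'id INT, arr ARRAY<INT>')

    t.select(
        F.expr('transform(arr, x -> x + 1)').alias('plus_one'),            # element-wise
        F.expr('filter(arr, x -> x > 0)').alias('positives'),              # keep matching elements
        F.expr('aggregate(arr, 0, (acc, x) -> acc + x)').alias('total'),   # fold/reduce
    ).show()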
Extras & Tips
explain (view the logical/physical plan)
  SQL:     -- engine-specific
  PySpark: df.explain(True)

repartition / coalesce (shuffle vs. no-shuffle repartitioning)
  SQL:     -- N/A
  PySpark: df.repartition(8); df.coalesce(2)

write partitionBy (on-disk layout for partition pruning)
  SQL:     -- N/A
  PySpark: df.write.partitionBy('year', 'month').parquet(path)