Aim: Exploratory Data Analysis (EDA) Assignment on the Titanic dataset
Source Code:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Fetch the Titanic dataset through seaborn's dataset loader
df = sns.load_dataset('titanic')

# Q1: Report, per column, how many entries are missing
print("Missing values in each column:\n")
missing_per_column = df.isna().sum()
print(missing_per_column)

# Q2: Impute the gaps -- the column mean for 'age', the most frequent
# value for 'embarked'
mean_age = df['age'].mean()
most_common_port = df['embarked'].mode()[0]
df['age'] = df['age'].fillna(mean_age)
df['embarked'] = df['embarked'].fillna(most_common_port)

# Q3: Remove the 'deck' column from the frame
df = df.drop(columns=['deck'])
# Q4: Univariate analysis -- side-by-side histograms (with a KDE overlay)
# of the 'age' and 'fare' columns
fig, (age_ax, fare_ax) = plt.subplots(1, 2, figsize=(14, 5))

sns.histplot(df['age'], kde=True, bins=30, ax=age_ax)
age_ax.set_title('Age Distribution')

sns.histplot(df['fare'], kde=True, bins=30, ax=fare_ax)
fare_ax.set_title('Fare Distribution')

# Tidy the spacing between the two panels before rendering
fig.tight_layout()
plt.show()
# Q5: Bar chart of how many passengers fall into each 'sex' category
_, sex_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='sex', data=df, ax=sex_ax)
sex_ax.set_title("Passenger Count by Sex")
plt.show()
# Q6: Passenger counts per 'sex', with bars split by the 'survived' column
_, survival_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='sex', hue='survived', data=df, ax=survival_ax)
survival_ax.set_title("Survival Count by Gender")
plt.show()
# Q7: Box plot comparing the spread of 'fare' across 'pclass' values
_, fare_box_ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x='pclass', y='fare', data=df, ax=fare_box_ax)
fare_box_ax.set_title("Fare Distribution by Pclass")
plt.show()
# Q8: Derive 'family_size' as sibsp + parch + 1 (the extra 1 counts the
# passenger themselves)
df['family_size'] = df['sibsp'].add(df['parch']).add(1)

# Show the first rows of the inputs next to the derived column
preview_columns = ['sibsp', 'parch', 'family_size']
print("\nQ8. Sample of new 'family_size' feature:\n")
print(df.loc[:, preview_columns].head())
# Q9: Heatmap of pairwise correlations, computed with numeric_only=True
correlations = df.corr(numeric_only=True)

_, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlations,
    annot=True,     # write each coefficient inside its cell
    fmt='.2f',      # two decimal places
    cmap='coolwarm',
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Heatmap")
plt.show()
# Q10: Observations
# Print the written observations as a numbered list.
# A quoted string literal cannot span physical lines, so the long second
# observation is assembled from adjacent literals inside the parentheses
# instead of being wrapped mid-string (which is a SyntaxError).
observations = (
    "Females had a significantly higher survival rate than males.",
    "Passengers in 1st class paid higher fares "
    "and had a higher survival rate.",
)

print("\nObservations:")
for number, text in enumerate(observations, start=1):
    print(f"{number}. {text}")
Output: