import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
# Load dataset
df = pd.read_csv('/content/WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Drop customerID
df.drop('customerID', axis=1, inplace=True)
# TotalCharges is read in as strings; coerce to numeric and drop the
# handful of rows left blank (they become NaN after coercion)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.dropna(inplace=True)
# Encode target variable
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
# Convert binary categorical features
binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for col in binary_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0})
# One-hot encode remaining categorical variables
df = pd.get_dummies(df, drop_first=True)
# Features and target
X = df.drop('Churn', axis=1)
y = df['Churn']
# Scale numerical features
scaler = StandardScaler()
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
X[num_cols] = scaler.fit_transform(X[num_cols])
# Balance the dataset using SMOTE
smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(X, y)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_bal, y_bal, test_size=0.2, random_state=42)
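# Caveat: oversampling (and fitting the scaler) before the split lets
# information derived from test rows leak into training, so the scores
# below are likely somewhat optimistic. A minimal leakage-free sketch,
# not wired into the models below (names X_tr/X_te etc. are illustrative):
# split first, then resample only the training fold.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
X_tr_bal, y_tr_bal = SMOTE(random_state=42).fit_resample(X_tr, y_tr)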
# --- 1. Logistic Regression ---
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_log = logreg.predict(X_test)
print("Logistic Regression:")
print(classification_report(y_test, y_pred_log))
print("ROC-AUC:", roc_auc_score(y_test, logreg.predict_proba(X_test)
[:, 1]))
print("-" * 60)
Logistic Regression:
              precision    recall  f1-score   support

           0       0.81      0.78      0.80      1037
           1       0.79      0.82      0.80      1029

    accuracy                           0.80      2066
   macro avg       0.80      0.80      0.80      2066
weighted avg       0.80      0.80      0.80      2066

ROC-AUC: 0.880777135210056
------------------------------------------------------------
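# Optional sanity check: the confusion_matrix and seaborn imports above
# are otherwise unused; a quick sketch visualizing where the logistic
# regression errs (all names come from the cells above):
cm = confusion_matrix(y_test, y_pred_log)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No churn', 'Churn'],
            yticklabels=['No churn', 'Churn'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Logistic Regression Confusion Matrix')
plt.show()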
# --- 2. Random Forest ---
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("Random Forest:")
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,
1]))
print("-" * 60)
Random Forest:
              precision    recall  f1-score   support

           0       0.84      0.82      0.83      1037
           1       0.82      0.85      0.83      1029

    accuracy                           0.83      2066
   macro avg       0.83      0.83      0.83      2066
weighted avg       0.83      0.83      0.83      2066

ROC-AUC: 0.9135555861688939
------------------------------------------------------------
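# Optional: 5-fold cross-validated ROC-AUC for the random forest, using
# the cross_val_score import above. A sketch only; because X_bal/y_bal
# contain synthetic samples, these scores share the optimism noted earlier.
cv_auc = cross_val_score(rf, X_bal, y_bal, cv=5, scoring='roc_auc')
print("CV ROC-AUC: %.3f +/- %.3f" % (cv_auc.mean(), cv_auc.std()))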
# --- 3. XGBoost ---
# use_label_encoder is deprecated and ignored by recent XGBoost releases
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("XGBoost Classifier:")
print(classification_report(y_test, y_pred_xgb))
print("ROC-AUC:", roc_auc_score(y_test, xgb.predict_proba(X_test)[:,
1]))
print("-" * 60)
XGBoost Classifier:
              precision    recall  f1-score   support

           0       0.84      0.81      0.82      1037
           1       0.81      0.84      0.83      1029

    accuracy                           0.83      2066
   macro avg       0.83      0.83      0.83      2066
weighted avg       0.83      0.83      0.83      2066

ROC-AUC: 0.9048410933460034
------------------------------------------------------------
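# Optional: overlay the three ROC curves on one axis for comparison.
# A sketch assuming scikit-learn >= 1.0, where RocCurveDisplay is available.
from sklearn.metrics import RocCurveDisplay
fig, ax = plt.subplots(figsize=(8, 6))
for name, model in [('Logistic Regression', logreg),
                    ('Random Forest', rf), ('XGBoost', xgb)]:
    RocCurveDisplay.from_estimator(model, X_test, y_test, name=name, ax=ax)
plt.title('ROC Curves on the Test Set')
plt.show()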
# Feature Importance from Random Forest
importances = rf.feature_importances_
indices = np.argsort(importances)[-10:]
features = X.columns[indices]
plt.figure(figsize=(10, 6))
plt.title("Top 10 Feature Importances (Random Forest)")
plt.barh(range(len(indices)), importances[indices], align="center")
plt.yticks(range(len(indices)), features)
plt.xlabel("Relative Importance")
plt.show()
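# Optional cross-check: impurity-based importances can overweight
# high-cardinality features; permutation importance on the test set is a
# common alternative. A sketch (n_repeats and scoring are illustrative):
from sklearn.inspection import permutation_importance
perm = permutation_importance(rf, X_test, y_test, n_repeats=10,
                              random_state=42, scoring='roc_auc')
for i in np.argsort(perm.importances_mean)[-10:]:
    print(f"{X.columns[i]}: {perm.importances_mean[i]:.4f}")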