# Colab-only helper for moving a local file into the notebook runtime.
# (The import statement had been split across two lines, which is a syntax error.)
from google.colab import files

# Opens the browser upload widget; the recorded output below shows the
# dataset being saved as cardio_train.txt.
uploaded = files.upload()
Choose Files No file chosen Upload widget is only available when the cell has been executed in the current
browser session. Please rerun this cell to enable.
Saving cardio_train.txt to cardio_train.txt
# pandas is used throughout the notebook but was never imported in the
# visible cells, so bring it into scope here.
import pandas as pd

# The file is semicolon-delimited, hence the explicit separator.
df = pd.read_csv('cardio_train.txt', sep=';')

# Display first 5 rows
print(df.head())

# Dataset info. DataFrame.info() writes its report directly and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df.info()

# Check for missing values (per-column count of nulls)
print(df.isnull().sum())
id age gender height weight ap_hi ap_lo cholesterol gluc smoke \
0 0 18393 2 168 62.0 110 80 1 1 0
1 1 20228 1 156 85.0 140 90 3 1 0
2 2 18857 1 165 64.0 130 70 3 1 0
3 3 17623 2 169 82.0 150 100 1 1 0
4 4 17474 1 156 56.0 100 60 1 1 0
alco active cardio
0 0 1 0
1 0 1 1
2 0 0 1
3 0 1 1
4 0 0 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 70000 non-null int64
1 age 70000 non-null int64
2 gender 70000 non-null int64
3 height 70000 non-null int64
4 weight 70000 non-null float64
5 ap_hi 70000 non-null int64
6 ap_lo 70000 non-null int64
7 cholesterol 70000 non-null int64
8 gluc 70000 non-null int64
9 smoke 70000 non-null int64
10 alco 70000 non-null int64
11 active 70000 non-null int64
12 cardio 70000 non-null int64
dtypes: float64(1), int64(12)
memory usage: 6.9 MB
None
id 0
age 0
gender 0
height 0
weight 0
ap_hi 0
ap_lo 0
cholesterol 0
gluc 0
smoke 0
alco 0
active 0
cardio 0
dtype: int64
# Summary statistics: count, mean, std, min, quartiles and max for each column
summary_stats = df.describe()
print(summary_stats)
id age gender height weight \
count 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000
mean 49972.419900 19468.865814 1.349571 164.359229 74.205690
std 28851.302323 2467.251667 0.476838 8.210126 14.395757
min 0.000000 10798.000000 1.000000 55.000000 10.000000
25% 25006.750000 17664.000000 1.000000 159.000000 65.000000
50% 50001.500000 19703.000000 1.000000 165.000000 72.000000
75% 74889.250000 21327.000000 2.000000 170.000000 82.000000
max 99999.000000 23713.000000 2.000000 250.000000 200.000000
ap_hi ap_lo cholesterol gluc smoke \
count 70000.000000 70000.000000 70000.000000 70000.000000 70000.000000
mean 128.817286 96.630414 1.366871 1.226457 0.088129
std 154.011419 188.472530 0.680250 0.572270 0.283484
min -150.000000 -70.000000 1.000000 1.000000 0.000000
25% 120.000000 80.000000 1.000000 1.000000 0.000000
50% 120.000000 80.000000 1.000000 1.000000 0.000000
75% 140.000000 90.000000 2.000000 1.000000 0.000000
max 16020.000000 11000.000000 3.000000 3.000000 1.000000
alco active cardio
count 70000.000000 70000.000000 70000.000000
mean 0.053771 0.803729 0.499700
std 0.225568 0.397179 0.500003
min 0.000000 0.000000 0.000000
25% 0.000000 1.000000 0.000000
50% 0.000000 1.000000 0.000000
75% 0.000000 1.000000 1.000000
max 1.000000 1.000000 1.000000
# Drop 'id' column if it exists (the guard keeps the cell safe to re-run
# after the column has already been removed).
if 'id' in df.columns:
    df.drop('id', axis=1, inplace=True)

# Convert age to whole years by dividing by 365 and truncating. The summary
# statistics above show raw ages in the 10,000-24,000 range, i.e. presumably
# days. NOTE: this is not idempotent; re-running the cell divides again.
df['age'] = (df['age'] / 365).astype(int)
import seaborn as sns
import matplotlib.pyplot as plt

# Plot how many rows fall into each target class, but only if the target
# column is actually present in the frame.
if 'cardio' in df.columns:
    sns.countplot(x='cardio', data=df)
    plt.title('Distribution of Cardiovascular Disease')
    plt.xlabel('Cardiovascular Disease (0 = No, 1 = Yes)')
    plt.ylabel('Count')
    plt.show()
else:
    print("Column 'cardio' not found.")
# Pairwise correlations between every pair of columns in the frame
correlation_matrix = df.corr()

# Draw the annotated heatmap on its own 12x10 inch figure
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,      # write the coefficient inside each cell
    fmt=".2f",       # two decimal places
    cmap='coolwarm',
    square=True,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors
y = df['cardio']              # Target
X = df.drop('cardio', axis=1)  # Features: every remaining column

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

train_shape = X_train.shape
test_shape = X_test.shape
print("Training features shape:", train_shape)
print("Test features shape:", test_shape)
Training features shape: (56000, 11)
Test features shape: (14000, 11)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Initialize and train the model.
# The features are standardised first: the recorded run on raw features hit
# the solver's iteration limit even with max_iter=1000, and the warning itself
# recommends scaling the data. Wrapping both steps in a pipeline means the
# scaler is fitted on the training split only and reapplied at predict time.
lr_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lr_model.fit(X_train, y_train)

# Predict on test data
y_pred_lr = lr_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))
Accuracy: 0.7240714285714286
Confusion Matrix:
[[5360 1628]
[2235 4777]]
Classification Report:
precision recall f1-score support
0 0.71 0.77 0.74 6988
1 0.75 0.68 0.71 7012
accuracy 0.72 14000
macro avg 0.73 0.72 0.72 14000
weighted avg 0.73 0.72 0.72 14000
/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py:465: Co
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regressio
n_iter_i = _check_optimize_result(
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Confusion matrix for the logistic-regression test predictions
cm = confusion_matrix(y_test, y_pred_lr)

# Show the raw counts as an annotated heatmap (integer format) on a 6x4 figure
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Logistic Regression')
plt.show()
from sklearn.svm import SVC

# Fit a support-vector classifier with the library's default settings
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Score the held-out rows
y_pred_svm = svm_model.predict(X_test)

# The confusion matrix is computed once and reused for the printout and the plot
cm_svm = confusion_matrix(y_test, y_pred_svm)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)

print("SVM Accuracy:", svm_accuracy)
print("Confusion Matrix:\n", cm_svm)
print("Classification Report:\n", svm_report)

# Heatmap of the confusion-matrix counts
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Oranges', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - SVM')
plt.show()
SVM Accuracy: 0.7228571428571429
Confusion Matrix:
[[5723 1265]
[2615 4397]]
Classification Report:
precision recall f1-score support
0 0.69 0.82 0.75 6988
1 0.78 0.63 0.69 7012
accuracy 0.72 14000
macro avg 0.73 0.72 0.72 14000
weighted avg 0.73 0.72 0.72 14000
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training rows
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Score the held-out rows
y_pred_knn = knn_model.predict(X_test)

# The confusion matrix is computed once and reused for the printout and the plot
cm_knn = confusion_matrix(y_test, y_pred_knn)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)

print("KNN Accuracy:", knn_accuracy)
print("Confusion Matrix:\n", cm_knn)
print("Classification Report:\n", knn_report)

# Heatmap of the confusion-matrix counts
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_knn, annot=True, fmt='d', cmap='Purples', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - KNN')
plt.show()
KNN Accuracy: 0.6846428571428571
Confusion Matrix:
[[4944 2044]
[2371 4641]]
Classification Report:
precision recall f1-score support
0 0.68 0.71 0.69 6988
1 0.69 0.66 0.68 7012
accuracy 0.68 14000
macro avg 0.69 0.68 0.68 14000
weighted avg 0.69 0.68 0.68 14000
from sklearn.tree import DecisionTreeClassifier

# Single decision tree; the seed fixes any randomness in how the tree is built
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Score the held-out rows
y_pred_dt = dt_model.predict(X_test)

# The confusion matrix is computed once and reused for the printout and the plot
cm_dt = confusion_matrix(y_test, y_pred_dt)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)

print("Decision Tree Accuracy:", dt_accuracy)
print("Confusion Matrix:\n", cm_dt)
print("Classification Report:\n", dt_report)

# Heatmap of the confusion-matrix counts
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_dt, annot=True, fmt='d', cmap='Oranges', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Decision Tree')
plt.show()
Decision Tree Accuracy: 0.6347142857142857
Confusion Matrix:
[[4550 2438]
[2676 4336]]
Classification Report:
precision recall f1-score support
0 0.63 0.65 0.64 6988
1 0.64 0.62 0.63 7012
accuracy 0.63 14000
macro avg 0.63 0.63 0.63 14000
weighted avg 0.63 0.63 0.63 14000
from sklearn.svm import SVC

# Support-vector classifier with an explicit RBF kernel and fixed seed.
# The recorded metrics for this run match the earlier SVC() run exactly.
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Score the held-out rows
y_pred_svm = svm_model.predict(X_test)

# The confusion matrix is computed once and reused for the printout and the plot
cm_svm = confusion_matrix(y_test, y_pred_svm)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)

print("SVM Accuracy:", svm_accuracy)
print("Confusion Matrix:\n", cm_svm)
print("Classification Report:\n", svm_report)

# Heatmap of the confusion-matrix counts
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm_svm, annot=True, fmt='d', cmap='Purples', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - SVM')
plt.show()
SVM Accuracy: 0.7228571428571429
Confusion Matrix:
[[5723 1265]
[2615 4397]]
Classification Report:
precision recall f1-score support
0 0.69 0.82 0.75 6988
1 0.78 0.63 0.69 7012
accuracy 0.72 14000
macro avg 0.73 0.72 0.72 14000
weighted avg 0.73 0.72 0.72 14000
from sklearn.svm import SVC

# Refit the default support-vector classifier (text-only evaluation, no plot)
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Score the held-out rows
y_pred_svm = svm_model.predict(X_test)

# Gather the metrics first, then report them
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_confusion = confusion_matrix(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)

print("SVM Accuracy:", svm_accuracy)
print("Confusion Matrix:\n", svm_confusion)
print("Classification Report:\n", svm_report)
SVM Accuracy: 0.7228571428571429
Confusion Matrix:
[[5723 1265]
[2615 4397]]
Classification Report:
precision recall f1-score support
0 0.69 0.82 0.75 6988
1 0.78 0.63 0.69 7012
accuracy 0.72 14000
macro avg 0.73 0.72 0.72 14000
weighted avg 0.73 0.72 0.72 14000
from sklearn.tree import DecisionTreeClassifier

# Refit the seeded decision tree (text-only evaluation, no plot)
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Score the held-out rows
y_pred_dt = dt_model.predict(X_test)

# Gather the metrics first, then report them
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_confusion = confusion_matrix(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)

print("Decision Tree Accuracy:", dt_accuracy)
print("Confusion Matrix:\n", dt_confusion)
print("Classification Report:\n", dt_report)
Decision Tree Accuracy: 0.6347142857142857
Confusion Matrix:
[[4550 2438]
[2676 4336]]
Classification Report:
precision recall f1-score support
0 0.63 0.65 0.64 6988
1 0.64 0.62 0.63 7012
accuracy 0.63 14000
macro avg 0.63 0.63 0.63 14000
weighted avg 0.63 0.63 0.63 14000
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed for repeatable results
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Gather the metrics first, then report them
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("Random Forest Accuracy:", rf_accuracy)
print("Confusion Matrix:\n", rf_confusion)
print("Classification Report:\n", rf_report)
Random Forest Accuracy: 0.7057142857142857
Confusion Matrix:
[[4947 2041]
[2079 4933]]
Classification Report:
precision recall f1-score support
0 0.70 0.71 0.71 6988
1 0.71 0.70 0.71 7012
accuracy 0.71 14000
macro avg 0.71 0.71 0.71 14000
weighted avg 0.71 0.71 0.71 14000
# Dictionary of model names and their accuracy scores on the shared test split.
# Each entry uses whichever predictions that model's most recent cell produced.
model_accuracies = {
    'Logistic Regression': accuracy_score(y_test, y_pred_lr),
    'Decision Tree': accuracy_score(y_test, y_pred_dt),
    'Random Forest': accuracy_score(y_test, y_pred_rf),
    'SVM': accuracy_score(y_test, y_pred_svm),
    'KNN': accuracy_score(y_test, y_pred_knn),
}

# Display model comparison, one line per model, rounded to four decimals
for model, acc in model_accuracies.items():
    print(f'{model}: {acc:.4f}')
Logistic Regression: 0.7241
Decision Tree: 0.6347
Random Forest: 0.7057
SVM: 0.7229
KNN: 0.6846
# Bar chart comparing test accuracy across models
model_names = list(model_accuracies.keys())
accuracy_values = list(model_accuracies.values())

plt.figure(figsize=(10, 6))
plt.bar(model_names, accuracy_values, color='skyblue')
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)  # tilt the model names so they do not overlap
plt.ylim(0.5, 1)         # y-axis starts at 0.5 rather than 0
plt.show()
# Pair each feature column with the importance the fitted random forest gave it
feature_names = X.columns
importances = rf_model.feature_importances_

# Build the table and rank it, most important feature first
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Show the ten highest-ranked rows
top_features = feature_importance_df.head(10)
print(top_features)
Feature Importance
3 weight 0.230041
2 height 0.211850
4 ap_hi 0.186320
0 age 0.164083
5 ap_lo 0.096743
6 cholesterol 0.040225
1 gender 0.019445
7 gluc 0.018327
10 active 0.015191
8 smoke 0.009663
# Bar plot of feature importance (top 10 rows of the ranked table)
plt.figure(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn (see the FutureWarning in
# the recorded output). Mapping hue to the same variable as the y axis and
# hiding the redundant legend keeps the per-bar colouring without the warning.
sns.barplot(
    x='Importance',
    y='Feature',
    hue='Feature',
    data=feature_importance_df.head(10),
    palette='viridis',
    legend=False,
)
plt.title('Top 10 Important Features - Random Forest')
plt.tight_layout()
plt.show()
<ipython-input-28-696a840a30af>:3: FutureWarning:
Passing `palette` without assigning `hue` is deprecated and will be removed in v
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(10),
import joblib

# Persist the fitted random forest to disk in the working directory
model_path = 'random_forest_cardio_model.pkl'
joblib.dump(rf_model, model_path)
print("Model saved successfully!")
Model saved successfully!
# Read the persisted random forest back from disk
loaded_model = joblib.load('random_forest_cardio_model.pkl')

# Smoke-test the reloaded model on the first five test rows
first_rows = X_test[:5]
sample_prediction = loaded_model.predict(first_rows)
print("Sample predictions:", sample_prediction)
Sample predictions: [1 1 1 0 0]
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: 2 x 2 x 2 = 8 combinations
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
}

# Exhaustive search with 5-fold cross-validation, scored by accuracy.
# The estimator is seeded so that repeated runs pick the same winner; this
# matches the random_state=42 used for every other random forest in the notebook.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
Best parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best score: 0.7348035714285714
import matplotlib.pyplot as plt
import seaborn as sns

# Importances from the fitted random forest, with their column labels
features = X.columns
feature_importance = rf_model.feature_importances_

# Positions that order the importances from smallest to largest
sorted_idx = feature_importance.argsort()
ordered_importance = feature_importance[sorted_idx]
ordered_features = features[sorted_idx]

# Horizontal bars, one per feature, in ascending order of importance
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=ordered_importance, y=ordered_features, ax=ax)
ax.set_title("Feature Importance - Random Forest")
plt.show()
# Score the test split with the random forest and pair each prediction
# with the true label
predictions = rf_model.predict(X_test)
output = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': predictions,
    }
)

# Write the comparison table to CSV without the index column
output.to_csv('predictions.csv', index=False)
print("Predictions saved.")
Predictions saved.
# NOTE(review): this cell looks like an unfinished draft of a grid search; a
# complete version with the same imports and estimator appears in a later cell.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Define the model (seeded random forest; no hyper-parameters set here)
rf = RandomForestClassifier(random_state=42)
# Bare Ellipsis literal used as a placeholder; it has no effect on `rf`
...
Ellipsis
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Seeded base estimator whose hyper-parameters will be tuned
rf = RandomForestClassifier(random_state=42)

# Search space: 2 x 3 x 2 x 2 = 24 candidate settings
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# 3-fold cross-validated grid search scored by accuracy, run on all
# available cores (n_jobs=-1) with progress messages (verbose=1)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search for later evaluation
best_rf = grid_search.best_estimator_
Fitting 3 folds for each of 24 candidates, totalling 72 fits
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Predict probabilities; column 1 is the probability of the second class
# (cardio == 1 for this 0/1 target)
y_proba = best_rf.predict_proba(X_test)[:, 1]

# Compute ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
# (This call had been cut off mid-argument, leaving an unclosed parenthesis.)
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=2,
    label='ROC curve (AUC = {:.2f})'.format(roc_auc),
)
# Diagonal reference line: the expected curve of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic - Random Forest')
plt.legend(loc="lower right")
plt.grid()
plt.show()
import joblib
# Save the best Random Forest model to a file