I IMPLEMENTATION OF REGRESSION
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import pandas as pd
# Fetch the California Housing data bundle from scikit-learn
housing = fetch_california_housing()
# Wrap the feature matrix in a DataFrame labelled with the feature names,
# which makes it easier to inspect
housing_df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
# Report the dimensions of the feature table
print(f"Shape of the dataset: {housing_df.shape}")
# Preview the leading rows under a heading
print("First few rows of the dataset:", housing_df.head(), sep="\n")
# Hold out 20% of the samples as a test set; the fixed random_state keeps
# the split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(
    housing.data,
    housing.target,
    test_size=0.2,
    random_state=42,
)
# Train a linear regression model on the training split
lr = LinearRegression()
lr.fit(X_train, y_train)
# Make predictions on the testing set
y_pred_lr = lr.predict(X_test)
# Calculate the mean squared error for the linear regression model
# (lower is better) and report it to two decimal places
mse_lr = mean_squared_error(y_test, y_pred_lr)
print(f"Linear Regression Mean Squared Error: {mse_lr:.2f}")
# Train a random forest regression model with 100 trees; the fixed
# random_state makes the fitted forest reproducible
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
# Make predictions on the testing set
y_pred_rf = rf.predict(X_test)
# Calculate the mean squared error for the random forest regression model
# and report it to two decimal places
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest Mean Squared Error: {mse_rf:.2f}")
# Train a Support Vector Regression model with scikit-learn's default
# hyperparameters
# NOTE(review): the features are passed in unscaled; scikit-learn's SVM
# guidance recommends standardising inputs, so this score may understate
# what SVR can achieve - consider a StandardScaler pipeline if the
# comparison is meant to be fair
svr = SVR()
svr.fit(X_train, y_train)
# Make predictions on the testing set
y_pred_svr = svr.predict(X_test)
# Calculate the mean squared error for the SVR model and report it to
# two decimal places
mse_svr = mean_squared_error(y_test, y_pred_svr)
print(f"Support Vector Regression Mean Squared Error: {mse_svr:.2f}")
# Plotting the comparison of the models
# Labels and errors are kept in the same order so each bar lines up with
# its model
models = ['Linear Regression', 'Random Forest', 'Support Vector Regression']
mse_values = [mse_lr, mse_rf, mse_svr]
# Bar chart of Mean Squared Errors for all models (one colour per model)
plt.figure(figsize=(8, 6))
plt.bar(models, mse_values, color=['blue', 'green', 'orange'])
plt.title('Mean Squared Error Comparison')
plt.ylabel('Mean Squared Error')
plt.show()
# Plotting predicted vs true values for all models, one panel per model
# The reference diagonal spans the range of the true values, which is the
# same for every panel, so compute it once with NumPy reductions instead
# of re-running the Python built-ins over the array for each panel
y_lo, y_hi = np.min(y_test), np.max(y_test)
# (title prefix, predictions, marker colour) for each panel, left to right
panels = [
    ('Linear Regression', y_pred_lr, 'blue'),
    ('Random Forest', y_pred_rf, 'green'),
    ('SVR', y_pred_svr, 'orange'),
]
plt.figure(figsize=(18, 6))
for position, (label, y_pred, colour) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.scatter(y_test, y_pred, color=colour, alpha=0.5)
    # Dashed red y = x line: points lying on it are perfect predictions
    plt.plot([y_lo, y_hi], [y_lo, y_hi], color='red', linestyle='--')
    plt.title(f'{label}: Predicted vs True')
    plt.xlabel('True Values')
    plt.ylabel('Predicted Values')
plt.tight_layout()
plt.show()
Output:
Shape of the dataset: (20640, 8)
First few rows of the dataset:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85

   Longitude
0    -122.23
1    -122.22
2    -122.24
3    -122.25
4    -122.25
Linear Regression Mean Squared Error: 0.56
Random Forest Mean Squared Error: 0.26
Support Vector Regression Mean Squared Error: 1.33
SVR