ml-regression
October 23, 2024
[3]: # Load the car dataset and summarise it (no columns are dropped in this cell)
import pandas as pd
# Read the raw CSV into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks environment-specific -
# confirm it exists wherever this notebook is run.
data = pd.read_csv('/content/car data.csv')
# Check for missing values or inconsistencies
# Per-column count of null entries.
missing_values = data.isnull().sum()
# Summary statistics as produced by DataFrame.describe().
data_summary = data.describe()
# Last expression of the cell: both objects are displayed together as a tuple.
missing_values, data_summary
[3]: (Car_Name 0
Year 0
Selling_Price 0
Present_Price 0
Kms_Driven 0
Fuel_Type 0
Seller_Type 0
Transmission 0
Owner 0
dtype: int64,
Year Selling_Price Present_Price Kms_Driven Owner
count 301.000000 301.000000 301.000000 301.000000 301.000000
mean 2013.627907 4.661296 7.628472 36947.205980 0.043189
std 2.891554 5.082812 8.644115 38886.883882 0.247915
min 2003.000000 0.100000 0.320000 500.000000 0.000000
25% 2012.000000 0.900000 1.200000 15000.000000 0.000000
50% 2014.000000 3.600000 6.400000 32000.000000 0.000000
75% 2016.000000 6.000000 9.900000 48767.000000 0.000000
max 2018.000000 35.000000 92.600000 500000.000000 3.000000)
[8]: import numpy as np
# Select the feature columns and the regression target; the actual train/test
# split is performed further down by train_test_split_manual.
# Both selected columns appear in the numeric describe() summary, so
# get_dummies should have nothing to encode here and pass them through as-is.
X_car = pd.get_dummies(data[['Owner','Kms_Driven']], drop_first=True)
# Regression target: the selling price column.
y = data['Selling_Price']
1
def train_test_split_manual(X, y, test_size=0.2, random_state=None):
    """Shuffle X and y together and split them into train and test parts.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Feature rows; selected positionally with ``.iloc``.
    y : pandas.Series or pandas.DataFrame
        Targets aligned row-for-row with ``X``; selected with ``.iloc``.
    test_size : float, default 0.2
        Fraction of rows placed in the test set. The test set holds
        ``int(len(X) * test_size)`` rows (rounded down).
    random_state : int or None, default None
        Seed for the shuffle. When given, a private generator is used so the
        global NumPy random state is left untouched. When ``None``, the
        global ``np.random`` generator is used.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    n_rows = len(X)
    if len(y) != n_rows:
        raise ValueError(
            f"X and y must have the same length, got {n_rows} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size}")

    if random_state is None:
        indices = np.random.permutation(n_rows)
    else:
        # A dedicated RandomState produces the same permutation that seeding
        # the global generator would, without reseeding it for everyone else.
        indices = np.random.RandomState(random_state).permutation(n_rows)

    test_set_size = int(n_rows * test_size)
    test_indices = indices[:test_set_size]
    train_indices = indices[test_set_size:]
    return (
        X.iloc[train_indices],
        X.iloc[test_indices],
        y.iloc[train_indices],
        y.iloc[test_indices],
    )
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split_manual(
    X_car, y, test_size=0.2, random_state=42
)
# Implementing a simple linear regression model
class SimpleLinearRegression:
    """Ordinary least-squares linear regression with an intercept term.

    The model is solved with ``np.linalg.lstsq`` rather than by explicitly
    inverting ``X.T @ X``, so collinear (rank-deficient) feature sets yield
    the minimum-norm least-squares solution instead of failing or returning
    numerically unstable coefficients.
    """

    def __init__(self):
        # Learned weights with the intercept first; None until fit() has run.
        self.coefficients = None

    @staticmethod
    def _design_matrix(X):
        """Return X as a 2-D float array with a leading column of ones."""
        # Coercing to float lets mixed-dtype frames (e.g. boolean dummy
        # columns next to integers) be used without producing object arrays.
        features = np.asarray(X, dtype=float)
        if features.ndim == 1:
            features = features.reshape(-1, 1)
        return np.c_[np.ones(features.shape[0]), features]

    def fit(self, X, y):
        """Estimate the coefficients from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples,) or (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Training targets.
        """
        design = self._design_matrix(X)
        targets = np.asarray(y, dtype=float)
        # lstsq returns (solution, residuals, rank, singular_values).
        self.coefficients = np.linalg.lstsq(design, targets, rcond=None)[0]

    def predict(self, X):
        """Return predictions for X using the learned coefficients.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.coefficients is None:
            raise RuntimeError("fit() must be called before predict()")
        return self._design_matrix(X).dot(self.coefficients)
# Train the linear regression model
model_car = SimpleLinearRegression()
model_car.fit(X_train, y_train)
# Despite its name, y_car holds the model's predictions for X_test.
y_car = model_car.predict(X_test)
def calculate_rmse(y_true, y_pred):
    """Return the root-mean-squared error between two equally shaped inputs.

    Both inputs are converted to float arrays first, so values are compared
    position by position. This avoids pandas aligning two Series on their
    index labels (which silently yields NaN or a wrong score when the labels
    differ) and also lets plain lists be passed.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_true - y_pred) ** 2))``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.shape != predicted.shape:
        # Guard against silent broadcasting, e.g. (n,) against (n, 1).
        raise ValueError(
            f"shape mismatch: y_true {actual.shape} vs y_pred {predicted.shape}"
        )
    return np.sqrt(np.mean((actual - predicted) ** 2))
# Calculate RMSE on the held-out test set
rmse_car = calculate_rmse(y_test, y_car)
# Collect actual vs. predicted prices for the test set side by side
predicted_sales = pd.DataFrame(
    {'Actual Selling Price': y_test, 'Predicted Price': y_car}
)
# Display the first few rows of the predictions, then the overall error
print(predicted_sales.head())
print(f'RMSE score: {rmse_car}')
Actual Selling Price Predicted Price
177 0.35 4.638736
289 10.11 4.570413
228 4.95 4.827647
198 0.15 2.985661
60 6.95 4.722701
RSME score: 4.786033522433056