import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the salary dataset; `carsdf` is kept as a second name for the same frame.
# NOTE(review): `carsdf` is not referenced anywhere else in the visible code —
# confirm whether the alias is still needed.
carsdf = pd.read_csv("Salary_Data.csv")
salarydf = carsdf
# (rows, columns) of the loaded frame.
salarydf.shape
(30, 2)
# Print a summary of the frame: index range, columns, non-null counts, dtypes and memory usage.
salarydf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 YearsExperience 30 non-null float64
1 Salary 30 non-null int64
dtypes: float64(1), int64(1)
memory usage: 612.0 bytes
# Data type of each column.
salarydf.dtypes
YearsExperience float64
Salary int64
dtype: object
# First five rows of the dataset.
salarydf.head()
YearsExperience Salary
0 1.1 39343
1 1.3 46205
2 1.5 37731
3 2.0 43525
4 2.2 39891
# Last five rows of the dataset.
salarydf.tail()
YearsExperience Salary
25 9.0 105582
26 9.5 116969
27 9.6 112635
28 10.3 122391
29 10.5 121872
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
salarydf.describe()
YearsExperience Salary
count 30.000000 30.000000
mean 5.313333 76003.000000
std 2.837888 27414.429785
min 1.100000 37731.000000
25% 3.200000 56720.750000
50% 4.700000 65237.000000
75% 7.700000 100544.750000
max 10.500000 122391.000000
# Scatter plot of the raw data; the column names are resolved through `data=`.
plt.scatter("YearsExperience", "Salary", s=10, data=salarydf)
plt.xlabel("exp")
plt.ylabel("salary")
plt.title("Salary based on experience")
plt.show()
# Feature (years of experience) and target (salary) as NumPy arrays.
x = salarydf["YearsExperience"].values
y = salarydf["Salary"].values
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)
# Shapes of the four resulting arrays.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))
((21,), (9,), (21,), (9,))
from sklearn.linear_model import LinearRegression

# The split arrays are 1-D, so each is turned into a single-column matrix
# before being handed to fit().
regmodel = LinearRegression()
regmodel.fit(X_train[:, np.newaxis], y_train[:, np.newaxis])
LinearRegression()
# Fitted slope; 2-D here because the target was passed to fit() as a column vector.
regmodel.coef_
array([[9360.26128619]])
# Fitted intercept of the regression line.
regmodel.intercept_
array([26777.3913412])
from sklearn.metrics import r2_score

# R^2 of the fitted model on the training split.
rsquare = regmodel.score(X_train[:, np.newaxis], y_train[:, np.newaxis])
rsquare
0.9423777652193379
# Predicted salaries for the held-out test features.
y_predict = regmodel.predict(X_test[:, np.newaxis])
y_predict
array([[ 40817.78327049],
[123188.08258899],
[ 65154.46261459],
[ 63282.41035735],
[115699.87356004],
[108211.66453108],
[116635.89968866],
[ 64218.43648597],
[ 76386.77615802]])
# R^2 on the test split; this rebinds `rsquare`, replacing the training score.
rsquare = r2_score(y_true=y_test[:, np.newaxis], y_pred=y_predict)
rsquare
0.9740993407213511
# Seaborn scatter plot with a fitted regression line over the full dataset.
sns.lmplot(x="YearsExperience", y="Salary", data=salarydf)
plt.show()