# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Importing the dataset
# Load the MBA salary CSV into a DataFrame; the relative path is resolved
# against the current working directory
df = pd.read_csv('MBA Salary.csv')
# Preview the first rows (a bare expression: only rendered by an
# interactive/notebook session, not by a plain script run)
df.head()
S. No. Percentage in Grade 10 Salary
0 1 62.00 270000
1 2 76.33 200000
2 3 72.00 240000
3 4 60.00 250000
4 5 61.00 180000
# Summarise the columns: names, non-null counts, dtypes and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 S. No. 50 non-null int64
1 Percentage in Grade 10 50 non-null float64
2 Salary 50 non-null int64
dtypes: float64(1), int64(2)
memory usage: 1.3 KB
# Print the DataFrame dimensions as (rows, columns)
print(df.shape)
(50, 3)
# View descriptive statistics
# (count, mean, std, min, quartiles and max for each numeric column)
print(df.describe())
S. No. Percentage in Grade 10 Salary
count 50.00000 50.000000 50.000000
mean 25.50000 63.922400 258192.000000
std 14.57738 9.859937 76715.790993
min 1.00000 37.330000 120000.000000
25% 13.25000 57.685000 204500.000000
50% 25.50000 64.700000 250000.000000
75% 37.75000 70.000000 300000.000000
max 50.00000 83.000000 450000.000000
# Feature (Grade 10 percentage) and target (salary), each taken as a
# single DataFrame column
X, y = df['Percentage in Grade 10'], df['Salary']

# Scatter plot of salary against Grade 10 percentage
plt.scatter(X, y, label='Scatter Plot', color='blue')
plt.xlabel('Percentage in Grade 10')
plt.ylabel('Salary')
plt.title('Relationship between Grades and Salary of a person')
plt.legend(loc=4)  # location code 4 is 'lower right'
plt.show()

# Print the dimensions of X and y, one per line
print(X.shape, y.shape, sep='\n')
(50,)
(50,)
0 62.00
1 76.33
2 72.00
3 60.00
4 61.00
5 55.00
6 70.00
7 68.00
8 82.80
9 59.00
10 58.00
11 60.00
12 66.00
13 83.00
14 68.00
15 37.33
16 79.00
17 68.40
18 70.00
19 59.00
20 63.00
21 50.00
22 69.00
23 52.00
24 49.00
25 64.60
26 50.00
27 74.00
28 58.00
29 67.00
30 75.00
31 60.00
32 55.00
33 78.00
34 50.08
35 56.00
36 68.00
37 52.00
38 54.00
39 52.00
40 76.00
41 64.80
42 74.40
43 74.50
44 73.50
45 57.58
46 68.00
47 69.00
48 66.00
49 60.80
Name: Percentage in Grade 10, dtype: float64
# Convert the feature and target into plain NumPy arrays
X, y = np.array(X), np.array(y)
array([62. , 76.33, 72. , 60. , 61. , 55. , 70. , 68. , 82.8 ,
59. , 58. , 60. , 66. , 83. , 68. , 37.33, 79. , 68.4 ,
70. , 59. , 63. , 50. , 69. , 52. , 49. , 64.6 , 50. ,
74. , 58. , 67. , 75. , 60. , 55. , 78. , 50.08, 56. ,
68. , 52. , 54. , 52. , 76. , 64.8 , 74.4 , 74.5 , 73.5 ,
57.58, 68. , 69. , 66. , 60.8 ])
# Reshape X and y into single-column 2-D arrays; -1 lets NumPy infer the
# number of rows from the data
X, y = X.reshape(-1, 1), y.reshape(-1, 1)
# Print the dimensions of X and y after reshaping, one per line
print(X.shape, y.shape, sep='\n')
(50, 1)
(50, 1)
# Split X and y into training and test data sets.
# 30% of the rows are held out for testing; fixing random_state makes the
# shuffle, and therefore the split, reproducible between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.30
)
# Print the dimensions of X_train, y_train, X_test, y_test (one per line)
# NOTE(review): the recorded output below (33/17 rows) looks like it came
# from a larger test_size; 0.30 of 50 rows should give 35/15 -- confirm by
# re-running.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')
(33, 1)
(33, 1)
(17, 1)
(17, 1)
# Fit the linear model
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so instantiating and training the
# regression object lm chain into a single expression
lm = LinearRegression().fit(X_train, y_train)
# Predict on the held-out test data
y_pred = lm.predict(X_test)
# Visualising the Training set results
# Red points: observed training samples; blue line: the fitted regression
# evaluated on the same training inputs.
plt.scatter(X_train, y_train, color = 'red')
plt.plot(X_train, lm.predict(X_train), color = 'blue')
# Title and label the figure the same way as the test-set plot
plt.title('Training set results')
plt.xlabel('Grades')
plt.ylabel('Salary')
# Render this figure now; without show() a plain script keeps drawing the
# test-set plot onto these same axes.
plt.show()
[<matplotlib.lines.Line2D at 0x22c2d23c430>]
# Visualising the Test set results
# Axis labels and title first, then the data layers
plt.xlabel('Grades')
plt.ylabel('Salary')
plt.title('Test set results')
# Red points: observed test samples
plt.scatter(X_test, y_test, color='red')
# Blue line: the model's predictions for the test inputs (y_pred already
# holds lm.predict(X_test))
plt.plot(X_test, y_pred, color='blue')
plt.show()
# Compute model slope and intercept
# coef_ and intercept_ are the fitted parameters exposed by the estimator
slope = lm.coef_
# No trailing comma here: `lm.intercept_,` would bind a 1-tuple wrapping the
# value instead of the intercept array itself
intercept = lm.intercept_
print("Estimated model slope:" , slope)
print("Estimated model intercept:" , intercept)
Estimated model slope: [[1504.41195413]]
Estimated model intercept: (array([152845.01374103]),)
# Predict the salary for a single new sample with a Grade 10 percentage of 80.
# The input is nested ([[80]]) so it is 2-D like the training features:
# one row, one feature column.
X_new = [[80]]
# Bare expression: the prediction is only displayed by an interactive/notebook
# session; it is not stored or printed.
lm.predict(X_new)
array([[273197.97007155]])
Colab paid products
-
Cancel contracts here