In [117… import pandas as pd
In [119… import matplotlib as plt
In [121… df=pd.read_csv('HousingData.csv')  # load the housing data from a CSV in the working directory into a DataFrame
In [123… df
Out[123… CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1 296 15.3 396.9
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2 242 17.8 396.9
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2 242 17.8 392.8
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3 222 18.7 394.6
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3 222 18.7 396.9
... ... ... ... ... ... ... ... ... ... ... ... .
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1 273 21.0 391.9
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1 273 21.0 396.9
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1 273 21.0 396.9
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1 273 21.0 393.4
505 0.04741 0.0 11.93 0.0 0.573 6.030 NaN 2.5050 1 273 21.0 396.9
506 rows × 14 columns
In [125… df.describe()
Out[125… CRIM ZN INDUS CHAS NOX RM A
count 486.000000 486.000000 486.000000 486.000000 506.000000 506.000000 486.0000
mean 3.611874 11.211934 11.083992 0.069959 0.554695 6.284634 68.5185
std 8.720192 23.388876 6.835896 0.255340 0.115878 0.702617 27.9995
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.9000
25% 0.081900 0.000000 5.190000 0.000000 0.449000 5.885500 45.1750
50% 0.253715 0.000000 9.690000 0.000000 0.538000 6.208500 76.8000
75% 3.560263 12.500000 18.100000 0.000000 0.624000 6.623500 93.9750
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.0000
In [127… df.index
Out[127… RangeIndex(start=0, stop=506, step=1)
In [129… df.head  # NOTE(review): no call parentheses, so the output below is the bound method's repr, not the first rows; df.head() was probably intended
Out[129… <bound method NDFrame.head of CRIM ZN INDUS CHAS NOX RM AGE
DIS RAD TAX \
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1 296
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2 242
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2 242
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3 222
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3 222
.. ... ... ... ... ... ... ... ... ... ...
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1 273
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1 273
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1 273
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1 273
505 0.04741 0.0 11.93 0.0 0.573 6.030 NaN 2.5050 1 273
PTRATIO B LSTAT MEDV
0 15.3 396.90 4.98 24.0
1 17.8 396.90 9.14 21.6
2 17.8 392.83 4.03 34.7
3 18.7 394.63 2.94 33.4
4 18.7 396.90 NaN 36.2
.. ... ... ... ...
501 21.0 391.99 NaN 22.4
502 21.0 396.90 9.08 20.6
503 21.0 396.90 5.64 23.9
504 21.0 393.45 6.48 22.0
505 21.0 396.90 7.88 11.9
[506 rows x 14 columns]>
In [131… df.head()
Out[131… CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
In [133… df.tail()
Out[133… CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1 273 21.0 391.99
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1 273 21.0 396.90
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1 273 21.0 396.90
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1 273 21.0 393.45
505 0.04741 0.0 11.93 0.0 0.573 6.030 NaN 2.5050 1 273 21.0 396.90
In [135… df1=df.dropna()  # drop every row that contains a NaN; the outputs shown go from 506 rows to 394 rows
In [137… df1
Out[137… CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1 296 15.3 396.9
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2 242 17.8 396.9
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2 242 17.8 392.8
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3 222 18.7 394.6
5 0.02985 0.0 2.18 0.0 0.458 6.430 58.7 6.0622 3 222 18.7 394.1
... ... ... ... ... ... ... ... ... ... ... ... .
499 0.17783 0.0 9.69 0.0 0.585 5.569 73.5 2.3999 6 391 19.2 395.7
500 0.22438 0.0 9.69 0.0 0.585 6.027 79.7 2.4982 6 391 19.2 396.9
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1 273 21.0 396.9
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1 273 21.0 396.9
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1 273 21.0 393.4
394 rows × 14 columns
In [139… df1.isnull()  # element-wise missing-value mask; all False is expected here because df1 is the result of dropna()
Out[139… CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LS
0 False False False False False False False False False False False False F
1 False False False False False False False False False False False False F
2 False False False False False False False False False False False False F
3 False False False False False False False False False False False False F
5 False False False False False False False False False False False False F
... ... ... ... ... ... ... ... ... ... ... ... ...
499 False False False False False False False False False False False False F
500 False False False False False False False False False False False False F
502 False False False False False False False False False False False False F
503 False False False False False False False False False False False False F
504 False False False False False False False False False False False False F
394 rows × 14 columns
In [141… import numpy as np
In [143… df.columns
Out[143… Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
'PTRATIO', 'B', 'LSTAT', 'MEDV'],
dtype='object')
In [145… x=df1[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
'PTRATIO', 'B', 'LSTAT']]  # feature matrix: every column of df1 except the target MEDV
In [147… y=df1[['MEDV']]  # target kept as a one-column DataFrame (double brackets), which is why the predictions later come back as a 2-D array
In [149… x
Out[149… CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1 296 15.3 396.9
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2 242 17.8 396.9
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2 242 17.8 392.8
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3 222 18.7 394.6
5 0.02985 0.0 2.18 0.0 0.458 6.430 58.7 6.0622 3 222 18.7 394.1
... ... ... ... ... ... ... ... ... ... ... ... .
499 0.17783 0.0 9.69 0.0 0.585 5.569 73.5 2.3999 6 391 19.2 395.7
500 0.22438 0.0 9.69 0.0 0.585 6.027 79.7 2.4982 6 391 19.2 396.9
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1 273 21.0 396.9
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1 273 21.0 396.9
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1 273 21.0 393.4
394 rows × 13 columns
In [151… y
Out[151… MEDV
0 24.0
1 21.6
2 34.7
3 33.4
5 28.7
... ...
499 17.5
500 16.8
502 20.6
503 23.9
504 22.0
394 rows × 1 columns
In [153… from sklearn.model_selection import train_test_split
In [155… from sklearn.linear_model import LinearRegression
In [157… x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)  # hold out 25% of the rows for testing; fixed seed keeps the split reproducible
In [159… lm = LinearRegression()  # linear regression model with default settings
lm.fit(x_train,y_train)  # fit on the training split only
Out[159… ▾ LinearRegression
LinearRegression()
In [161… predictions=lm.predict(x_test)  # model predictions of MEDV for the held-out test rows
In [163… predictions
Out[163… array([[29.40315823],
[17.32130164],
[21.88438126],
[30.53897078],
[18.74211111],
[34.49957016],
[22.18772843],
[30.40016057],
[33.30285959],
[15.07065834],
[22.15266803],
[40.7986985 ],
[21.81856333],
[16.57492925],
[19.06901469],
[20.36048341],
[17.0094124 ],
[15.281753 ],
[22.57233525],
[14.31888726],
[18.48367439],
[20.59950781],
[17.02508296],
[29.32482063],
[26.16559295],
[16.14119305],
[27.05876571],
[31.80271449],
[22.91935834],
[27.06461644],
[40.9285908 ],
[18.23395575],
[22.32757596],
[17.40043581],
[17.75100984],
[21.14924374],
[22.00396948],
[21.66088774],
[23.1598697 ],
[20.86075164],
[27.6556627 ],
[34.50415574],
[22.12517086],
[30.76421403],
[34.86183582],
[19.95497559],
[24.94727208],
[10.54144025],
[19.61633902],
[25.25533113],
[21.82692573],
[26.1224415 ],
[14.58753912],
[18.60907444],
[18.73269469],
[23.877063 ],
[43.39153407],
[22.4868859 ],
[15.58046183],
[23.42839246],
[21.35028809],
[21.41783737],
[14.59575339],
[28.83213251],
[-3.15989071],
[32.30293312],
[16.84715194],
[31.11268272],
[24.82335627],
[19.94867799],
[31.74110086],
[32.10741168],
[19.29684222],
[19.39763513],
[19.26122628],
[35.15713713],
[19.27819376],
[28.30029199],
[16.64597387],
[16.86726667],
[36.9666121 ],
[23.88353751],
[24.02647044],
[12.90795354],
[28.61288195],
[20.95440577],
[15.33870473],
[12.28771055],
[38.47633494],
[37.07379058],
[20.50728689],
[18.65066319],
[27.44199919],
[16.08244976],
[26.83793964],
[38.72160447],
[30.53333731],
[22.90723493],
[21.57563759]])
In [183… from sklearn import metrics
mse=metrics.mean_squared_error(y_test,predictions)  # mean squared error between actual and predicted MEDV on the test split
In [185… np.sqrt(mse)  # square root of the MSE, i.e. the RMSE, expressed in the same units as MEDV
Out[185… 5.457269439843607
In [193… import matplotlib.pyplot as plt
import seaborn as sns  # NOTE(review): sns is not used in any of the cells shown
plt.scatter(y_test,predictions)  # actual (x-axis) vs. predicted (y-axis) MEDV for the test split
plt.xlabel("y_test")
plt.ylabel("y_predictions")  # last expression in the cell, so its Text object is what the notebook echoes as output
Out[193… Text(0, 0.5, 'y_predictions')
In [ ]: