2. Apply the following Pre-processing techniques for a given dataset.
a. Attribute selection
b. Handling Missing Values
c. Discretization
d. Elimination of Outliers
import pandas as pd
import numpy as np
import sklearn
# Load the raw dataset. The raw string keeps the Windows backslashes literal,
# and the path must stay on a single line so the string literal is terminated.
dataset = pd.read_csv(r"C:\Users\91798\Downloads\Data1.csv - Sheet1.csv")
dataset
Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
ATTRIBUTE SELECTION
# Work on a copy so the originally loaded frame is left untouched.
df = dataset.copy()
# Feature matrix: the values of every column except the last one.
x = df.iloc[:, :-1].values
# Target vector: the values of the last column only.
y = df.iloc[:, -1].values
print('Values in x: ', x)
print()
print('Values in y:', y)
Values in x: [['France' 44.0 72000.0]
['Spain' 27.0 48000.0]
['Germany' 30.0 54000.0]
['Spain' 38.0 61000.0]
['Germany' 40.0 nan]
['France' 35.0 58000.0]
['Spain' nan 52000.0]
['France' 48.0 79000.0]
['Germany' 50.0 83000.0]
['France' 37.0 67000.0]]
Values in y: ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']
HANDLING MISSING VALUES--DROPPING NULL VALUES
# Report how many missing entries each column holds.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Drop every row containing a null, working on a copy so `df` keeps its NaNs.
df1 = df.copy()
print('Before dropping NULL Values:', '', df1, sep='\n')
df1 = df1.dropna()
print('', 'After dropping NULL Values:', df1, sep='\n')
Country 0
Age 1
Salary 1
Purchased 0
dtype: int64
Before dropping NULL Values:
Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
After dropping NULL Values:
Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
5 France 35.0 58000.0 Yes
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
HANDLING MISSING VALUES--FILLING NULL VALUES
# Impute on a copy so `df` keeps its original missing entries.
df2 = df.copy()
# Fill the null values of Age with the mean of the Age column.
df2['Age'] = df2['Age'].fillna(df2['Age'].mean())
# Fill the null values of Salary with the mean of the Salary column.
df2['Salary'] = df2['Salary'].fillna(df2['Salary'].mean())
df2
Country Age Salary Purchased
0 France 44.000000 72000.000000 No
1 Spain 27.000000 48000.000000 Yes
2 Germany 30.000000 54000.000000 No
3 Spain 38.000000 61000.000000 No
4 Germany 40.000000 63777.777778 Yes
5 France 35.000000 58000.000000 Yes
6 Spain 38.777778 52000.000000 No
7 France 48.000000 79000.000000 Yes
8 Germany 50.000000 83000.000000 No
9 France 37.000000 67000.000000 Yes
CONVERTING CATEGORICAL DATA INTO NUMERICAL DATA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 0 of `x` and pass every other column through unchanged.
encoding_steps = [('encoder', OneHotEncoder(), [0])]
ct = ColumnTransformer(
    transformers=encoding_steps,
    remainder='passthrough',
)
X = np.array(ct.fit_transform(x))

# Show the source frame next to the encoded matrix for comparison.
print(df)
print('After Encoding:')
print(X)
Country Age Salary Purchased
0 France 44.0 72000.0 No
1 Spain 27.0 48000.0 Yes
2 Germany 30.0 54000.0 No
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
After Encoding:
[[1.0 0.0 0.0 44.0 72000.0]
[0.0 0.0 1.0 27.0 48000.0]
[0.0 1.0 0.0 30.0 54000.0]
[0.0 0.0 1.0 38.0 61000.0]
[0.0 1.0 0.0 40.0 nan]
[1.0 0.0 0.0 35.0 58000.0]
[0.0 0.0 1.0 nan 52000.0]
[1.0 0.0 0.0 48.0 79000.0]
[0.0 1.0 0.0 50.0 83000.0]
[1.0 0.0 0.0 37.0 67000.0]]
STANDARD SCALER (STANDARDIZATION)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

sta = StandardScaler()
# Scale only the columns from index 3 onward; columns 0-2 hold the one-hot
# indicators produced by the encoder and are left as they are.
# The scaler is fitted on the training rows only and then reused for the test
# rows, so no statistics from the test split leak into the fit.
# NOTE: X was built from `x`, which still contains the NaN entries of `df`,
# so those entries remain NaN after scaling.
X_train[:, 3:] = sta.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sta.transform(X_test[:, 3:])
print(X_train[:, 3:])
[[nan -1.0182239953527132]
[-0.03891021128204815 nan]
[0.5058327466666259 0.5834766714942514]
[-0.31128169025638514 -0.2974586952715791]
[-1.8093248246152385 -1.3385641287221062]
[1.0505757046152997 1.1440719048906889]
[1.3229471835896367 1.4644120382600818]
[-0.7198389087178906 -0.5377137952986237]]
# Show the columns from index 3 onward for the held-out test rows.
print(X_test[:, 3:])
[[30.000000000000004 54000.00000000001]
[37.00000000000001 67000.00000000001]]