Task 1 - Implementation of Decision Tree Algorithm
Code:-
import sys
import matplotlib
matplotlib.use('Agg')
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
# Load the dataset and encode the categorical columns as integers
df = pd.read_csv("user_data.csv")
d_nationality = {'UK': 0, 'USA': 1, 'N': 2}
df['Nationality'] = df['Nationality'].map(d_nationality)
d_go = {'YES': 1, 'NO': 0}
df['Go'] = df['Go'].map(d_go)
# Split into feature matrix X and target y
features = ['Age', 'Experience', 'Rank', 'Nationality']
X = df[features]
y = df['Go']
# Fit the decision tree and save a rendering of it to disk
clf = DecisionTreeClassifier()
clf = clf.fit(X, y)
plt.figure(figsize=(20, 10))
tree.plot_tree(clf, feature_names=features, class_names=['No', 'Yes'], filled=True)
plt.savefig('decision_tree.png')
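Once fitted, the classifier can also be queried directly; a minimal sketch using made-up feature values (the sample below is hypothetical, not taken from the dataset):
sample = pd.DataFrame([[40, 10, 7, 1]], columns=features)  # hypothetical person, Nationality=USA
print(clf.predict(sample))  # 1 = GO, 0 = NO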
Dataset:-
Age Experience Rank Nationality Go
36 10 9 UK NO
42 12 4 USA NO
23 4 6 N NO
52 4 4 USA NO
43 21 8 USA YES
44 14 5 UK NO
66 3 7 N YES
35 14 9 UK YES
52 13 7 N YES
35 5 9 N YES
24 3 5 USA NO
18 3 7 UK YES
45 9 9 UK YES
Output:-
The fitted decision tree is saved as decision_tree.png (figure not reproduced here).
Task 2 - Back Propagation Algorithm.
Code:-
import numpy as np
X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
X = X/np.amax(X, axis=0)  # normalize each feature by its column-wise maximum
y = y/100  # scale target scores into [0, 1]
# Sigmoid activation function
def sigmoid(x):
    return 1/(1 + np.exp(-x))

# Derivative of the sigmoid, expressed in terms of the sigmoid's output
def derivatives_sigmoid(x):
    return x * (1 - x)
# Variable initialization
epoch = 5000              # number of training iterations
lr = 0.1                  # learning rate
inputlayer_neurons = 2    # number of features in the data set
hiddenlayer_neurons = 3   # number of neurons in the hidden layer
output_neurons = 1        # number of neurons at the output layer

# Weight and bias initialization: uniform random values of the required dimensions
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))
for i in range(epoch):
    # Forward propagation
    hinp1 = np.dot(X, wh)
    hinp = hinp1 + bh
    hlayer_act = sigmoid(hinp)
    outinp1 = np.dot(hlayer_act, wout)
    outinp = outinp1 + bout
    output = sigmoid(outinp)

    # Backpropagation
    EO = y - output  # error at the output layer
    outgrad = derivatives_sigmoid(output)
    d_output = EO * outgrad
    EH = d_output.dot(wout.T)  # how much the hidden-layer weights contributed to the error
    hiddengrad = derivatives_sigmoid(hlayer_act)
    d_hiddenlayer = EH * hiddengrad

    # Update weights and biases (dot product of next-layer error and current-layer output)
    wout += hlayer_act.T.dot(d_output) * lr
    wh += X.T.dot(d_hiddenlayer) * lr
    bout += np.sum(d_output, axis=0, keepdims=True) * lr
    bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr
print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n" ,output)
Output:-
Input:
[[0.66666667 1. ]
[0.33333333 0.55555556]
[1. 0.66666667]]
Actual Output:
[[0.92]
[0.86]
[0.89]]
Predicted Output:
[[0.92745804]
[0.91954311]
[0.92598481]]
Task 5 - Naïve Bayesian Classifier for Text Classification
Code:-
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
# Column names for the CSV file: the message text and its pos/neg label
names = ['message', 'label']
msg = pd.read_csv(r"exp5.csv", names=names)
print("Total Instances of Dataset: ", msg.shape[0])
# Map the labels to integers; any unknown label defaults to 0 (neg)
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0}).fillna(0).astype(int)
X = msg.message
y = msg.labelnum
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)  # default 25% test split
count_v = CountVectorizer()
Xtrain_dm = count_v.fit_transform(Xtrain)
Xtest_dm = count_v.transform(Xtest)
# Document-term matrix as a DataFrame, useful for inspecting the extracted features
df = pd.DataFrame(Xtrain_dm.toarray(), columns=count_v.get_feature_names_out())
clf = MultinomialNB()
clf.fit(Xtrain_dm, ytrain)
pred = clf.predict(Xtest_dm)
print('Accuracy Metrics:')
print('Accuracy: ', accuracy_score(ytest, pred))
print('Recall: ', recall_score(ytest, pred))
print('Precision: ', precision_score(ytest, pred))
print('Confusion Matrix: \n', confusion_matrix(ytest, pred))
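The fitted model can also label unseen text; a short sketch with a made-up sentence (the string below is illustrative only):
new_msg = ["I love this place"]  # hypothetical message
print(clf.predict(count_v.transform(new_msg)))  # 1 = pos, 0 = neg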
Output:-
Total Instances of Dataset: 12
Accuracy Metrics:
Accuracy: 0.6666666666666666
Recall: 0.5
Precision: 1.0
Confusion Matrix:
[[1 0]
[1 1]]
Task 6 - Bayesian Network Considering Medical Data.
Code:-
import pandas as pd
from pgmpy.estimators import ExpectationMaximization
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination
data = pd.read_csv(r"ds4.csv")
heart_disease = pd.DataFrame(data)
print(heart_disease)
# Network structure: each edge points from a parent variable to its child
model = BayesianNetwork([
    ('age', 'Lifestyle'),
    ('Gender', 'Lifestyle'),
    ('Family', 'heartdisease'),
    ('Lifestyle', 'diet'),
    ('diet', 'cholesterol'),
    ('cholesterol', 'heartdisease')
])
# Estimate the conditional probability tables from the data via Expectation-Maximization
model.fit(heart_disease, estimator=ExpectationMaximization)
HeartDisease_infer = VariableElimination(model)
print('For Age enter SuperSeniorCitizen:0, SeniorCitizen:1, MiddleAged:2, Youth:3, Teen:4')
print('For Gender enter Male:0, Female:1')
print('For Family History enter Yes:1, No:0')
print('For Diet enter High:0, Medium:1')
print('For LifeStyle enter Athlete:0, Active:1, Moderate:2, Sedentary:3')
print('For Cholesterol enter High:0, BorderLine:1, Normal:2')
q = HeartDisease_infer.query(variables=['heartdisease'], evidence={
'age': int(input('Enter Age: ')),
'Gender': int(input('Enter Gender: ')),
'Family': int(input('Enter Family History: ')),
'diet': int(input('Enter Diet: ')),
'Lifestyle': int(input('Enter Lifestyle: ')),
'cholesterol': int(input('Enter Cholesterol: '))
})
print(q)
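Evidence does not have to be supplied for every variable; pgmpy marginalizes over whatever is left unobserved. A sketch querying on cholesterol alone:
q2 = HeartDisease_infer.query(variables=['heartdisease'],
                              evidence={'cholesterol': 0})  # High cholesterol only
print(q2)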
Dataset:-
age  Gender  Family  diet  Lifestyle  cholesterol  heartdisease
0 0 1 1 3 0 1
0 1 1 1 3 0 1
1 0 0 0 2 1 1
4 0 1 1 3 2 0
3 1 1 0 0 2 0
2 0 1 1 1 0 1
4 0 1 0 2 0 1
0 0 1 1 3 0 1
3 1 1 0 0 2 0
1 1 0 0 0 2 1
4 1 0 1 2 0 1
4 0 1 1 3 2 0
2 1 0 0 0 0 0
2 0 1 1 1 0 1
3 1 1 0 0 1 0
0 0 1 0 0 2 1
1 1 0 1 2 1 1
3 1 1 1 0 1 0
4 0 1 1 3 2 0
Output:-
For Age enter SuperSeniorCitizen:0, SeniorCitizen:1, MiddleAged:2, Youth:3, Teen:4
For Gender enter Male:0, Female:1
For Family History enter Yes:1, No:0
For Diet enter High:0, Medium:1
For LifeStyle enter Athlete:0, Active:1, Moderate:2, Sedentary:3
For Cholesterol enter High:0, BorderLine:1, Normal:2
Enter Age: 2
Enter Gender: 1
Enter Family History: 0
Enter Diet: 1
Enter Lifestyle: 2
Enter Cholesterol: 1
+-----------------+---------------------+
| heartdisease | phi(heartdisease) |
+=================+=====================+
| heartdisease(0) | 0.5000 |
+-----------------+---------------------+
| heartdisease(1) | 0.5000 |
+-----------------+---------------------+
Task 7 - Expectation Maximization Algorithm
Code:-
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture
from sklearn.datasets import load_iris
import sklearn.metrics as sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset=load_iris()
# print(dataset)
X=pd.DataFrame(dataset.data)
X.columns=['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width']
y=pd.DataFrame(dataset.target)
y.columns=['Targets']
# print(X)
plt.figure(figsize=(14,7))
colormap=np.array(['red','lime','black'])
# REAL PLOT
plt.subplot(1,3,1)
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[y.Targets],s=40)
plt.title('Real')
# K-MEANS PLOT
plt.subplot(1, 3, 2)
model = KMeans(n_clusters=3)
model.fit(X)
# Map cluster labels to class indices (identity mapping here; reorder if the ids don't match)
predY = np.choose(model.labels_, [0, 1, 2]).astype(np.int64)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[predY], s=40)
plt.title('KMeans')
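GaussianMixture and preprocessing are imported but never used, so the expectation-maximization step itself is missing. A minimal completion for the third subplot, following the same plotting pattern (GMM cluster labels may be permuted relative to the true classes, so panel colours can differ):
# GMM (EM) PLOT
plt.subplot(1, 3, 3)
scaler = preprocessing.StandardScaler()
xsa = scaler.fit_transform(X)                 # standardize the features
xs = pd.DataFrame(xsa, columns=X.columns)
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)                                   # EM runs inside fit()
y_gmm = gmm.predict(xs)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y_gmm], s=40)
plt.title('GMM Classification')
plt.show()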
Output:
Scatter plots of petal length vs. petal width, coloured by the real classes and by the fitted clusters (figure not reproduced here).
Task 8 - Principal Component Analysis for Dimensionality Reduction.
Code:-
from numpy import array
from numpy import mean
from numpy import cov
from numpy.linalg import eig
# define a small 3×2 matrix
matrix = array([[5, 6], [8, 10], [12, 18]])
print("original Matrix: ")
print(matrix)
# calculate the mean of each column
Mean_col = mean(matrix.T, axis=1)
print("Mean of each column: ")
print(Mean_col)
# center columns by subtracting column means
Centre_col = matrix - Mean_col
print("Centered Matrix: ")
print(Centre_col)
# calculate covariance matrix of centered matrix
cov_matrix = cov(Centre_col.T)
print("Covariance Matrix: ")
print(cov_matrix)
# eigendecomposition of covariance matrix
values, vectors = eig(cov_matrix)
print("Eigen vectors: ",vectors)
print("Eigen values: ",values)
# project the centered data onto the principal axes
projected_data = vectors.T.dot(Centre_col.T)
print("Projected Data: ")
print(projected_data.T)
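As a sanity check, the same projection can be obtained from scikit-learn's PCA (component signs may be flipped relative to the manual eigendecomposition):
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
print(pca.fit_transform(matrix))  # PCA centers the data internally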
Output:
Original Matrix:
[[ 5  6]
 [ 8 10]
 [12 18]]
Mean of each column:
[ 8.33333333 11.33333333]
Centered Matrix:
[[-3.33333333 -5.33333333]
 [-0.33333333 -1.33333333]
 [ 3.66666667  6.66666667]]
Covariance Matrix:
[[12.33333333 21.33333333]
 [21.33333333 37.33333333]]
Eigen vectors: [[-0.86762506 -0.49721902]
[ 0.49721902 -0.86762506]]
Eigen values: [ 0.10761573 49.55905094]
Projected Data:
[[ 0.24024879  6.28473039]
 [-0.37375033  1.32257309]
 [ 0.13350154 -7.60730348]]