BCSL606 Machine Learning Lab
2 Develop a program to Compute the correlation matrix to understand the relationships between pairs of
features. Visualize the correlation matrix using a heatmap to know which variables have strong
positive/negative correlations. Create a pair plot to visualize pairwise relationships between features. Use
California Housing dataset.
3 Develop a program to implement Principal Component Analysis (PCA) for reducing the dimensionality of the
Iris dataset from 4 features to 2.
4 For a given set of training data examples stored in a .CSV file, implement and demonstrate the Find-S
algorithm to output a description of the set of all hypotheses consistent with the training examples.
5 Develop a program to implement the k-Nearest Neighbour algorithm to classify 100 randomly generated values
of x in the range [0,1]. Perform the following based on the dataset generated.
1. Label the first 50 points {x1,……,x50} as follows: if (xi ≤ 0.5), then xi ∊ Class1, else xi ∊ Class2
2. Classify the remaining points, x51,……,x100 using KNN. Perform this for k=1,2,3,4,5,20,30
6 Implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select an
appropriate data set for your experiment and draw graphs.
7 Develop a program to demonstrate the working of Linear Regression and Polynomial Regression. Use Boston
Housing Dataset for Linear Regression and Auto MPG Dataset (for vehicle fuel efficiency prediction) for
Polynomial Regression.
8 Develop a program to demonstrate the working of the decision tree algorithm. Use Breast Cancer Data set for
building the decision tree and apply this knowledge to classify a new sample.
9 Develop a program to implement the Naive Bayesian classifier considering Olivetti Face Data set for training.
Compute the accuracy of the classifier, considering a few test data sets.
10 Develop a program to implement k-means clustering using Wisconsin Breast Cancer data set and visualize the
clustering result.
PROGRAM 1
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
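A minimal sketch of the correlation-matrix, heatmap, and pair-plot analysis described in experiment 2, assuming the California Housing data is loaded as a DataFrame with fetch_california_housing; this is an illustration, not the original listing.
# Sketch: correlation matrix, heatmap, and pair plot on California Housing (experiment 2)
housing = fetch_california_housing(as_frame=True)
df = housing.frame  # features plus the MedHouseVal target column
# Correlation matrix between all pairs of columns
corr_matrix = df.corr()
print(corr_matrix)
# Heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix - California Housing')
plt.show()
# Pair plot of pairwise relationships (a random sample keeps it fast)
sns.pairplot(df.sample(500, random_state=42), diag_kind='kde')
plt.show()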
PROGRAM 3
Develop a program to implement Principal Component Analysis (PCA) for reducing the dimensionality of the Iris dataset from 4 features to 2.
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Load the Iris dataset
iris = load_iris()
data = iris.data
labels = iris.target
label_names = iris.target_names
# Convert to a DataFrame for better visualization
iris_df = pd.DataFrame(data, columns=iris.feature_names)
# Perform PCA to reduce dimensionality to 2
pca = PCA(n_components=2)
data_reduced = pca.fit_transform(data)
# Create a DataFrame for the reduced data
reduced_df = pd.DataFrame(data_reduced, columns=['Principal Component 1', 'Principal Component 2'])
reduced_df['Label'] = labels
# Plot the reduced data
plt.figure(figsize=(8, 6))
colors = ['r', 'g', 'b']
for i, label in enumerate(np.unique(labels)):
    plt.scatter(
        reduced_df[reduced_df['Label'] == label]['Principal Component 1'],
        reduced_df[reduced_df['Label'] == label]['Principal Component 2'],
        label=label_names[label],
        color=colors[i]
    )
plt.title('PCA on Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid()
plt.show()
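A quick optional check, not in the original listing: the share of the total variance retained by the two principal components.
# Optional check: proportion of variance captured by the two components
print("Explained variance ratio:", pca.explained_variance_ratio_)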
OUTPUT
PROGRAM 4
For a given set of training data examples stored in a .CSV file, implement
and demonstrate the Find-S algorithm to output a description of the set of
all hypotheses consistent with the training examples.
import pandas as pd
def find_s_algorithm(file_path):
    data = pd.read_csv(file_path)
    print("Training data:")
    print(data)
    attributes = data.columns[:-1]
    class_label = data.columns[-1]
    hypothesis = None
    for index, row in data.iterrows():
        if row[class_label] == 'Yes':
            if hypothesis is None:
                # Initialise with the first positive example (the most specific hypothesis)
                hypothesis = list(row[attributes])
            else:
                # Generalise every attribute that disagrees with this positive example
                for i, value in enumerate(row[attributes]):
                    if hypothesis[i] != value:
                        hypothesis[i] = '?'
    return hypothesis
file_path = 'training_data.csv'
hypothesis = find_s_algorithm(file_path)
print("\nThe final hypothesis is:", hypothesis)
OUTPUT
Training data:
Outlook Temperature Humidity Windy PlayTennis
0 Sunny Hot High False No
1 Sunny Hot High True No
2 Overcast Hot High False Yes
3 Rain Cold High False Yes
4 Rain Cold High True No
5 Overcast Hot High True Yes
6 Sunny Hot High False No
The final hypothesis is: ['?', '?', 'High', '?']
PROGRAM 5
Develop a program to implement the k-Nearest Neighbour algorithm to classify 100 randomly generated values
of x in the range [0,1]. Perform the following based on the dataset generated.
1. Label the first 50 points {x1,……,x50} as follows: if (xi ≤ 0.5), then xi ∊ Class1, else xi ∊ Class2
2. Classify the remaining points, x51,……,x100 using KNN. Perform this for k=1,2,3,4,5,20,30
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
def generate_data():
    np.random.seed(42)  # For reproducibility
    x = np.random.rand(100)  # Generate 100 random values in [0,1]
    # Label the first 50 points: Class1 if xi <= 0.5, otherwise Class2
    labels = np.array(["Class1" if xi <= 0.5 else "Class2" for xi in x[:50]])
    return x, labels
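A minimal sketch of the classification step for points x51 to x100, assuming the absolute difference as the distance on these 1-D points and a majority vote over the k nearest neighbours; the helper name knn_classify is illustrative, not from the original program.
def knn_classify(train_x, train_labels, test_point, k):
    # Distance from the test point to every labelled point (1-D, so absolute difference)
    distances = sorted(zip(np.abs(train_x - test_point), train_labels), key=lambda d: d[0])
    k_nearest = [label for _, label in distances[:k]]
    # Majority vote among the k nearest neighbours
    return Counter(k_nearest).most_common(1)[0][0]

x, labels = generate_data()
train_x, train_labels = x[:50], labels
test_x = x[50:]
for k in [1, 2, 3, 4, 5, 20, 30]:
    predictions = [knn_classify(train_x, train_labels, xi, k) for xi in test_x]
    print(f"k={k}: predictions for x51..x55 -> {predictions[:5]}")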
PROGRAM 7
Develop a program to demonstrate the working of Linear Regression and Polynomial Regression. Use Boston Housing Dataset for Linear Regression and Auto MPG Dataset (for vehicle fuel efficiency prediction) for Polynomial Regression.
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
def linear_regression_california():
    housing = fetch_california_housing(as_frame=True)
    X = housing.data[["AveRooms"]]
    y = housing.target
    # Split into training and test sets before fitting the model
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Linear Regression (California Housing) MSE:", mean_squared_error(y_test, y_pred))
def polynomial_regression_auto_mpg():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
    column_names = ["mpg", "cylinders", "displacement", "horsepower", "weight",
                    "acceleration", "model_year", "origin"]
    data = pd.read_csv(url, sep='\s+', names=column_names, na_values="?")
    data = data.dropna()
    X = data["displacement"].values.reshape(-1, 1)
    y = data["mpg"].values
    # Split into training and test sets before fitting the model
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    poly_model = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), LinearRegression())
    poly_model.fit(X_train, y_train)
    y_pred = poly_model.predict(X_test)
    print("Polynomial Regression (Auto MPG) MSE:", mean_squared_error(y_test, y_pred))
if __name__ == "__main__":
    print("Demonstrating Linear Regression and Polynomial Regression\n")
    linear_regression_california()
    polynomial_regression_auto_mpg()
OUTPUT
PROGRAM 8
Develop a program to demonstrate the working of the decision tree algorithm. Use Breast Cancer Data set for building the decision tree and apply this knowledge to classify a new sample.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
new_sample = np.array([X_test[0]])
prediction = clf.predict(new_sample)
prediction_class = "Benign" if prediction == 1 else "Malignant"
print(f"Predicted Class for the new sample: {prediction_class}")
plt.figure(figsize=(12,8))
tree.plot_tree(clf, filled=True, feature_names=data.feature_names,
class_names=data.target_names)
plt.title("Decision Tree - Breast Cancer Dataset")
plt.show()
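An optional addition, not in the original listing: the fitted tree can also be printed as plain-text rules via export_text, which is often easier to scan than the plot.
from sklearn.tree import export_text
# Print the learned decision rules as indented text
print(export_text(clf, feature_names=list(data.feature_names)))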
OUTPUT
PROGRAM 9
Develop a program to implement the Naive Bayesian classifier considering
Olivetti Face Data set for training. Compute the accuracy of the classifier,
considering a few test data sets.
import numpy as np
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
# Load the Olivetti faces and hold out a test set
data = fetch_olivetti_faces(shuffle=True, random_state=42)
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=1))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
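A possible use of the cross_val_score import above, shown as a sketch rather than as part of the original program:
# Hypothetical addition: 5-fold cross-validated accuracy of a fresh Gaussian NB model
cv_scores = cross_val_score(GaussianNB(), X, y, cv=5)
print(f"\nCross-validation accuracy: {cv_scores.mean() * 100:.2f}%")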
PROGRAM 10
Develop a program to implement k-means clustering using Wisconsin Breast Cancer data set and visualize the clustering result.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report
data = load_breast_cancer()
X = data.data
y = data.target
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Fit k-means with two clusters (matching the two diagnosis classes)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
y_kmeans = kmeans.fit_predict(X_scaled)
print("Confusion Matrix:")
print(confusion_matrix(y, y_kmeans))
print("\nClassification Report:")
print(classification_report(y, y_kmeans))
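# Note (not in the original listing): k-means assigns cluster ids arbitrarily, so cluster 0 may
# correspond to either the malignant or the benign class. If the confusion matrix above looks
# inverted, the ids can be flipped with: y_kmeans = 1 - y_kmeans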
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# DataFrame with the two principal components, cluster assignments, and true labels for plotting
df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df['Cluster'] = y_kmeans
df['True Label'] = y
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1', s=100,
edgecolor='black', alpha=0.7)
plt.title('K-Means Clustering of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.show()
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='True Label', palette='coolwarm',
s=100, edgecolor='black', alpha=0.7)
plt.title('True Labels of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="True Label")
plt.show()
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1', s=100,
edgecolor='black', alpha=0.7)
centers = pca.transform(kmeans.cluster_centers_)
plt.scatter(centers[:, 0], centers[:, 1], s=200, c='red', marker='X',
label='Centroids')
plt.title('K-Means Clustering with Centroids')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.show()
OUTPUT
Confusion Matrix:
[[175 37]
[ 13 344]]
Classification Report:
precision recall f1-score support