ML 3
1: Develop a program to create histograms for all numerical features and analyze the distribution of each
feature. Generate box plots for all numerical features and identify any outliers. Use the California Housing
dataset.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
california = fetch_california_housing()
df = pd.DataFrame(california.data, columns=california.feature_names)
df.head()
print(df.describe())
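# The task also asks for histograms of every numerical feature; a minimal
# sketch using the same 3x3 grid layout as the box plots below:
plt.figure(figsize=(12, 8))
for i, col in enumerate(df.columns):
    plt.subplot(3, 3, i + 1)
    sns.histplot(df[col], kde=True)  # histogram with a density estimate overlaid
    plt.title(f'Histogram of {col}')
plt.tight_layout()
plt.show()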
plt.figure(figsize=(12, 8))
for i, col in enumerate(df.columns):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(y=df[col])
    plt.title(f'Box Plot of {col}')
    plt.xlabel("")
plt.tight_layout()
plt.show()
Data Set:
File name: California_housing dataset from sklearn
Output:
2: Develop a program to compute the correlation matrix to understand the relationships between pairs of
features. Visualize the correlation matrix using a heatmap to identify which variables have strong
positive/negative correlations. Create a pair plot to visualize pairwise relationships between features. Use
the California Housing dataset.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
california = fetch_california_housing()
df = pd.DataFrame(california.data, columns=california.feature_names)
"""
correlation_matrix = df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix of California Housing Features")
plt.show()
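The statement also asks for a pair plot; a minimal sketch (rendering all pairwise plots for the eight features can take a while):
sns.pairplot(df)
plt.suptitle("Pairwise Relationships of California Housing Features", y=1.02)
plt.show()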
Data Set:
File name: California_housing dataset from sklearn
Output:
3: Develop a program to implement Principal Component Analysis (PCA) for dimensionality reduction. Use
the Iris dataset and reduce it from 4 features to 2.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
iris = datasets.load_iris()
X = iris.data # Features (4-dimensional)
y = iris.target # Target labels (Setosa, Versicolor, Virginica)
"""# Standardize the features (PCA works better with standardized data)"""
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
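# A quick check of how much of the original variance the two components retain
print("Explained variance ratio:", pca.explained_variance_ratio_)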
# Collect the projected data into a DataFrame for plotting
df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df_pca['Target'] = y
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PC1', y='PC2', hue=df_pca['Target'], palette='viridis', data=df_pca, alpha=0.8)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("PCA: Iris Dataset (4D → 2D)")
plt.legend(title="Species", labels=iris.target_names)
plt.show()
Data Set:
File name: Iris dataset from sklearn
Output:
4: For a given set of training data examples stored in a .CSV file, implement and demonstrate the Find-S
algorithm to output a description of the set of all hypotheses consistent with the training examples.
import pandas as pd

def find_s_algorithm(file_path):
    # Load the dataset: attribute columns first, class label in the last column
    data = pd.read_csv(file_path)
    attributes = data.iloc[:, :-1].values
    target = data.iloc[:, -1].values
    # Find-S generalizes only on positive examples;
    # the positive label is assumed here to be 'Yes'
    hypothesis = None
    for row, label in zip(attributes, target):
        if str(label).strip().lower() == 'yes':
            if hypothesis is None:
                # First positive example becomes the most specific hypothesis
                hypothesis = list(row)
            else:
                # Generalize mismatching attributes to '?'
                hypothesis = [h if h == v else '?' for h, v in zip(hypothesis, row)]
    return hypothesis

# Example usage:
file_path = "playing_tenis_dataset.csv"
final_hypothesis = find_s_algorithm(file_path)
print("Final Hypothesis:", final_hypothesis)
Data Set:
File name: playing_tenis_dataset.csv
Output:
5. Develop a program to implement the k-Nearest Neighbour algorithm to classify 100 randomly generated
values of x in the range [0,1]. Perform the following based on the dataset generated.
a. Label the first 50 points {x1,……,x50} as follows: if (xi ≤ 0.5), then xi ∊ Class1, else xi ∊ Class2
b. Classify the remaining points, x51,……,x100 using KNN. Perform this for k=1,2,3,4,5,20,30
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

# Generate 100 random values of x in [0, 1]
x = np.random.rand(100, 1)
# Label the first 50 points: Class1 (0) if xi <= 0.5, else Class2 (1)
labels = (x[:50] > 0.5).astype(int)
k_values = [1, 2, 3, 4, 5, 20, 30]
classification_results = {}
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x[:50], labels.ravel())    # Train using first 50 points
    predictions = knn.predict(x[50:])  # Predict the remaining 50 points
    classification_results[k] = predictions
    print(f"k={k}: Class1={np.sum(predictions == 0)}, Class2={np.sum(predictions == 1)}")
Data Set:
Filename: 100 randomly generated values of x in [0, 1]
Output:
6. Implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select
an appropriate data set for your experiment and draw graphs.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
def locally_weighted_regression(x_train, y_train, x_test, tau=0.5):
    # Add a bias column so each local fit includes an intercept
    x_train_aug = np.c_[np.ones(len(x_train)), x_train]
    x_test_aug = np.c_[np.ones(len(x_test)), x_test]
    y_pred = np.zeros(len(x_test))
    for i in range(len(x_test)):
        x_i = x_test_aug[i]
        # Gaussian kernel weights centred on the query point; tau is the bandwidth
        w = np.exp(-np.sum((x_train_aug - x_i) ** 2, axis=1) / (2 * tau ** 2))
        W = np.diag(w)
        # Weighted least squares: theta = (X^T W X)^+ X^T W y
        theta = np.linalg.pinv(x_train_aug.T @ W @ x_train_aug) @ x_train_aug.T @ W @ y_train
        y_pred[i] = x_i @ theta
    return y_pred
# Load dataset
df = pd.read_csv("diabetes.csv") # Ensure the file is in the same directory or provide the correct path
plt.xlabel("Standardized BMI")
plt.ylabel("Diabetes Outcome (0 or 1)")
plt.legend()
plt.title("Locally Weighted Regression on Pima Diabetes Dataset")
plt.show()
Data Set:
Filename: diabetes.csv
Output:
7a. Develop a program to demonstrate the working of Linear Regression and Polynomial Regression. Use
the Boston Housing Dataset for Linear Regression and the Auto MPG Dataset (for vehicle fuel efficiency
prediction) for Polynomial Regression.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
df = pd.read_csv("/content/drive/MyDrive/MachineLearningLab_2025/BostonHousing.csv")
df.head()
df.columns
df.info()
df = df.dropna()
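The snippet above stops after cleaning. A minimal sketch of the linear-regression step, assuming the CSV uses the common lowercase Boston Housing column names, with 'rm' (average rooms per dwelling) as the predictor and 'medv' (median home value) as the target:
X = df[['rm']]
y = df['medv']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))

# Sort the test points so the fitted line plots cleanly
order = np.argsort(X_test['rm'].values)
plt.scatter(X_test, y_test, alpha=0.5, label="Actual")
plt.plot(X_test['rm'].values[order], y_pred[order], color='red', label="Linear fit")
plt.xlabel("Average rooms per dwelling (rm)")
plt.ylabel("Median home value (medv)")
plt.legend()
plt.show()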
Dataset:
Output:
7b. Develop a program to demonstrate the working of Linear Regression and Polynomial Regression. Use
the Boston Housing Dataset for Linear Regression and the Auto MPG Dataset (for vehicle fuel efficiency
prediction) for Polynomial Regression.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
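# Assumed loading step: the UCI Auto MPG file is whitespace-separated with no
# header; the trailing car-name field is tab-prefixed, so comment='\t' drops it
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model_year', 'origin']
auto_mpg = pd.read_csv(url, names=column_names, na_values='?',
                       comment='\t', sep=' ', skipinitialspace=True)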
auto_mpg.head()
auto_mpg.info()
auto_mpg = auto_mpg.dropna()
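# Hypothetical feature choice: predict mpg from horsepower with a degree-2 fit
X_auto = auto_mpg[['horsepower']].values
y_auto = auto_mpg['mpg'].values
X_train_auto, X_test_auto, y_train_auto, y_test_auto = train_test_split(
    X_auto, y_auto, test_size=0.2, random_state=42)

poly = PolynomialFeatures(degree=2)
X_train_auto_poly = poly.fit_transform(X_train_auto)
X_test_auto_poly = poly.transform(X_test_auto)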
poly_reg = LinearRegression()
poly_reg.fit(X_train_auto_poly, y_train_auto)
y_pred_auto = poly_reg.predict(X_test_auto_poly)
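# Report test-set error and fit quality (uses the imported metrics)
print("MSE:", mean_squared_error(y_test_auto, y_pred_auto))
print("R^2:", r2_score(y_test_auto, y_pred_auto))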
Dataset:
Output:
8: Develop a program to demonstrate the working of the decision tree algorithm. Use Breast Cancer Data
set for building the decision tree and apply this knowledge to classify a new sample.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
df = datasets.load_breast_cancer()
data = pd.DataFrame(df.data, columns=df.feature_names)
data.head()
X = data # Features
y = df.target # Target (Malignant: 0, Benign: 1)
"""# Split the dataset into Training and Testing sets (80% train, 20% test)"""
y_pred = clf.predict(X_test)
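# Evaluate on the held-out test set (uses the imported metrics)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=df.target_names))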
new_sample = np.array([[15.0, 14.5, 85.0, 600.0, 0.09, 0.08, 0.05, 0.05, 0.17, 0.06,
0.35, 1.2, 2.5, 30.0, 0.007, 0.02, 0.02, 0.01, 0.02, 0.003,
16.0, 18.0, 110.0, 800.0, 0.14, 0.20, 0.20, 0.12, 0.30, 0.08]])
prediction = clf.predict(new_sample)
print(f"New Sample Prediction: {'Malignant' if prediction[0] == 0 else 'Benign'}")
Output:
9. Develop a program to implement the Naive Bayesian classifier considering the Olivetti Face Data set for
training. Compute the accuracy of the classifier, considering a few test data sets.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
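# Missing middle step, sketched with assumed split parameters: load the
# Olivetti faces (downloaded on first use), split, and fit Gaussian Naive Bayes
faces = fetch_olivetti_faces(shuffle=True, random_state=42)
X = faces.data    # each row is a flattened 64x64 grayscale face
y = faces.target  # subject id, 0..39

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)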
# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Naive Bayes Classifier Accuracy: {accuracy:.2f}')
Dataset:
Output:
10. Develop a program to implement k-means clustering using the Wisconsin Breast Cancer data set and
visualize the clustering result.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
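# Assumed model setup: two clusters/components, matching WDBC's
# benign/malignant structure
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
gmm = GaussianMixture(n_components=2, random_state=42)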
kmeans_labels = kmeans.fit_predict(X_pca)
gmm_labels = gmm.fit_predict(X_pca)
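# Side-by-side scatter plots of the two clusterings in PCA space
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_labels, cmap='viridis', alpha=0.6)
axes[0].set_title("K-Means Clusters")
axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=gmm_labels, cmap='viridis', alpha=0.6)
axes[1].set_title("GMM Clusters")
for ax in axes:
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
plt.tight_layout()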
plt.show()
Dataset:
Output :